1 /* Copyright (c) 1991 Sun Wu and Udi Manber.  All Rights Reserved. */
2 #include "agrep.h"
3 #include "checkfile.h"
4 #include <unistd.h>
5 #include <fcntl.h>
6 
7 int exponen(int m);
8 void r_output (CHAR *buffer, int i, int end, int j);
9 void file_out(char *fname);
10 void usage(void);
11 void checksg(CHAR *Pattern, int D);
12 
13 unsigned Mask[MAXSYM];
14 unsigned Init1, NO_ERR_MASK, Init[MaxError];
15 unsigned Bit[WORD+1];
16 CHAR buffer[BlockSize+Maxline+1];
17 unsigned Next[MaxNext], Next1[MaxNext];
18 unsigned wildmask, endposition, D_endpos;
19 int  REGEX, RE_ERR, FNAME, WHOLELINE, SIMPLEPATTERN;
20 int  COUNT, HEAD, TAIL, LINENUM, INVERSE, I, S, DD, AND, SGREP, JUMP;
21 int  Num_Pat, PSIZE, num_of_matched, SILENT, NOPROMPT, BESTMATCH, NOUPPER;
22 int  NOMATCH, TRUNCATE, FIRST_IN_RE, FIRSTOUTPUT;
23 int  WORDBOUND, DELIMITER, D_length;
24 int  EATFIRST, OUTTAIL;
25 int  FILEOUT;
26 int  DNA = 0;
27 int  APPROX = 0;
28 int  PAT_FILE = 0;
29 int  CONSTANT = 0;
30 int total_line = 0; /* used in mgrep */
31 
32 CHAR **Textfiles;     /* array of filenames to be searched */
33 CHAR old_D_pat[MaxDelimit] = "\n";  /* to hold original D_pattern */
34 CHAR CurrentFileName[MAXNAME];
35 CHAR Progname[MAXNAME];
36 CHAR D_pattern[MaxDelimit] = "\n; "; /* string which delimits records --
37                                         defaults to newline */
38 int  NOFILENAME = 0,  /* Boolean flag, set for -h option */
39      FILENAMEONLY = 0,/* Boolean flag, set for -l option */
40      Numfiles = 0;    /* indicates how many files in Textfiles */
41 extern int init();
42 int table[WORD][WORD];
43 
initial_value()44 void initial_value()
45 {
46    int i;
47 
48    JUMP = REGEX = FNAME = BESTMATCH = NOPROMPT = NOUPPER = 0;
49    COUNT = LINENUM = WHOLELINE = SGREP = 0;
50    EATFIRST = INVERSE = AND = TRUNCATE = OUTTAIL = 0;
51    FIRST_IN_RE = NOMATCH = FIRSTOUTPUT = ON;
52    I = DD = S = 1;
53    HEAD = TAIL = ON;
54    D_length = 2;
55    SILENT = Num_Pat = PSIZE = SIMPLEPATTERN = num_of_matched = 0 ;
56    WORDBOUND = DELIMITER = RE_ERR = 0;
57    Bit[WORD] = 1;
58    for (i = WORD - 1; i > 0  ; i--)  Bit[i] = Bit[i+1] << 1;
59    for (i=0; i< MAXSYM; i++) Mask[i] = 0;
60 }
61 
compute_next(M,Next,Next1)62 void compute_next(M, Next, Next1)
63 int M; unsigned *Next, *Next1;
64 {
65   int i, j=0, n,  k, temp;
66   int mid, pp;
67   int MM, base;
68   unsigned V[WORD];
69 
70   base = WORD - M;
71   temp = Bit[base]; Bit[base] = 0;
72   for (i=0; i<WORD; i++) V[i] = 0;
73   for (i=1; i<M; i++)
74   {
75       j=0;
76       while (table[i][j] > 0 && j < 10) {
77             V[i] = V[i] | Bit[base + table[i][j++]];
78       }
79   }
80   Bit[base]=temp;
81   if(M <= SHORTREG)
82   {
83     k = exponen(M);
84     pp = 2*k;
85     for(i=k; i<pp ; i++)
86     {   n = i;
87         Next[i]= (k>>1);
88         for(j=M; j>=1; j--)
89         {
90            if(n & Bit[WORD]) Next[i] = Next[i] | V[j];
91            n = (n>>1);
92         }
93     }
94     return;
95   }
96   if(M > MAXREG) fprintf(stderr, "%s: regular expression too long\n", Progname);
97   MM = M;
98   if(M & 1) M=M+1;
99   k = exponen(M/2);
100   pp = 2*k;
101   mid = MM/2;
102   for(i=k; i<pp ; i++)
103   {     n = i;
104         Next[i]= (Bit[base]>>1);
105         for(j=MM; j>mid ; j--)
106         {
107            if(n & Bit[WORD]) Next[i] = Next[i] | V[j-mid];
108            n = (n>>1);
109         }
110         n=i-k;
111         Next1[i-k] = 0;
112         for(j = 0; j<mid; j++)
113         {
114            if(n & Bit[WORD]) Next1[i-k] = Next1[i-k] | V[MM-j];
115            n = (n>>1);
116         }
117   }
118   return;
119 }
120 
exponen(m)121 int exponen(m)
122 int m;
123 { int i, ex;
124   ex= 1;
125   for (i=0; i<m; i++) ex= ex*2;
126   return(ex);
127 }
128 
re1(Text,M,D)129 void re1(Text, M, D)
130 int Text, M, D;
131 {
132   register unsigned i, c, r0, r1, r2, r3, CMask, Newline, Init0, r_NO_ERR;
133   register unsigned end;
134   register unsigned hh, LL=0, k;  /* Lower part */
135   int  FIRST_TIME=ON, num_read , j=0, base;
136   unsigned A[MaxRerror+1], B[MaxRerror+1];
137   unsigned Next[MaxNext], Next1[MaxNext];
138   CHAR buffer[BlockSize+Maxline+1];
139   int FIRST_LOOP = 1;
140 
141   r_NO_ERR = NO_ERR_MASK;
142   if(M > 30) {
143      fprintf(stderr, "%s: regular expression too long\n", Progname);
144      exit(2);
145   }
146   base = WORD - M;
147   hh = M/2;
148   for(i=WORD, j=0; j < hh ; i--, j++) LL = LL | Bit[i];
149   if(FIRST_IN_RE) compute_next(M, Next, Next1);
150                                    /*SUN: try: change to memory allocation */
151   FIRST_IN_RE = 0;
152   Newline = '\n';
153   Init[0] = Bit[base];
154   if(HEAD) Init[0] = Init[0] | Bit[base+1];
155   for(i=1; i<= D; i++) Init[i] = Init[i-1] | Next[Init[i-1]>>hh] | Next1[Init[i-1]&LL];
156   Init1 = Init[0] | 1;
157   Init0 = Init[0];
158   r2 = r3 = Init[0];
159   for(k=0; k<= D; k++) { A[k] = B[k] = Init[k]; }
160   if ( D == 0 )
161   {
162     while ((num_read = read(Text, buffer + Maxline, BlockSize)) > 0)
163     {
164       i=Maxline; end = num_read + Maxline;
165       if((num_read < BlockSize) && buffer[end-1] != '\n') buffer[end] = '\n';
166       if(FIRST_LOOP) {         /* if first time in the loop add a newline */
167         buffer[i-1] = '\n';  /* in front the  text.  */
168 	i--;
169         FIRST_LOOP = 0;
170       }
171       while ( i < end )
172       {
173         c = buffer[i++];
174         CMask = Mask[c];
175         if(c != Newline)
176         {  if(CMask != 0) {
177               r1 = Init1 & r3;
178               r2 = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | r1;
179            }
180 	   else  {
181               r2 = r3 & Init1;
182            }
183         }
184         else {  j++;
185               r1 = Init1 & r3;            /* match against endofline */
186               r2 = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | r1;
187               if(TAIL) r2 = (Next[r2>>hh] | Next1[r2&LL]) | r2;                                        /* epsilon move */
188               if(( r2 & 1 ) ^ INVERSE) {
189                    if(FILENAMEONLY) {
190                           num_of_matched++;
191                           printf("%s\n", CurrentFileName);
192                           return;
193                    }
194                    r_output(buffer, i-1, end, j);
195               }
196               r3 = Init0;
197               r2 = (Next[r3>>hh] | Next1[r3&LL]) & CMask | Init0;
198                                                /* match begin of line */
199         }
200         c = buffer[i++];
201         CMask = Mask[c];
202         if(c != Newline)
203         {  if(CMask != 0) {
204               r1 = Init1 & r2;
205               r3 = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | r1;
206            }
207 	   else   r3 = r2 & Init1;
208         } /* if(NOT Newline) */
209         else {  j++;
210               r1 = Init1 & r2;            /* match against endofline */
211               r3 = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | r1;
212               if(TAIL) r3 = ( Next[r3>>hh] | Next1[r3&LL] ) | r3;
213                                            /* epsilon move */
214               if(( r3 & 1 ) ^ INVERSE) {
215                    if(FILENAMEONLY) {
216                           num_of_matched++;
217                           printf("%s\n", CurrentFileName);
218                           return;
219                    }
220                    r_output(buffer, i-1, end, j);
221               }
222               r2 = Init0;
223               r3 = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | Init0;
224                    /* match begin of line */
225         }
226       } /* while i < end ... */
227       strncpy(buffer, buffer+num_read, Maxline);
228     } /* end while read()... */
229   return;
230   } /*  end if (D == 0) */
231   while ((num_read = read(Text, buffer + Maxline, BlockSize)) > 0)
232   {
233     i=Maxline; end = Maxline + num_read;
234     if((num_read < BlockSize) && buffer[end-1] != '\n') buffer[end] = '\n';
235     if(FIRST_TIME) {         /* if first time in the loop add a newline */
236         buffer[i-1] = '\n';  /* in front the  text.  */
237 	i--;
238         FIRST_TIME = 0;
239     }
240     while (i < end )
241     {
242         c = buffer[i];
243         CMask = Mask[c];
244         if(c !=  Newline)
245         {
246            if(CMask != 0) {
247               r2 = B[0];
248               r1 = Init1 & r2;
249               A[0] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | r1;
250               r3 = B[1];
251               r1 = Init1 & r3;
252               r0 = r2 | A[0];     /* A[0] | B[0] */
253               A[1] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) |                                       (( r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
254                      if(D == 1) goto Nextchar;
255               r2 = B[2];
256               r1 = Init1 & r2;
257               r0 = r3 | A[1];
258               A[2] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) |                                       ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
259                      if(D == 2) goto Nextchar;
260               r3 = B[3];
261               r1 = Init1 & r3;
262               r0 = r2 | A[2];
263               A[3] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) |                                       ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
264                      if(D == 3) goto Nextchar;
265               r2 = B[4];
266               r1 = Init1 & r2;
267               r0 = r3 | A[3];
268               A[4] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) |                                        ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
269                      if(D == 4)  goto Nextchar;
270            }  /* if(CMask) */
271 	   else  {
272               r2 = B[0];
273               A[0] = r2 & Init1;
274               r3 = B[1];
275               r1 = Init1 & r3;
276               r0 = r2 | A[0];
277               A[1] = ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
278                    if(D == 1) goto Nextchar;
279               r2 = B[2];
280               r1 = Init1 & r2;
281               r0 = r3 | A[1];
282               A[2] = ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
283                    if(D == 2) goto Nextchar;
284               r3 = B[3];
285               r1 = Init1 & r3;
286               r0 = r2 | A[2];
287               A[3] = ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
288                    if(D == 3) goto Nextchar;
289               r2 = B[4];
290               r1 = Init1 & r2;
291               r0 = r3 | A[3];
292               A[4] = ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
293                    if(D == 4) goto Nextchar;
294            }
295         }
296         else {  j++;
297               r1 = Init1 & B[D];            /* match against endofline */
298               A[D] = ((Next[B[D]>>hh] | Next1[B[D]&LL]) & CMask) | r1;
299               if(TAIL) A[D] = ( Next[A[D]>>hh] | Next1[A[D]&LL] ) | A[D];
300                                            /* epsilon move */
301               if(( A[D] & 1 ) ^ INVERSE) {
302                    if(FILENAMEONLY) {
303                           num_of_matched++;
304                           printf("%s\n", CurrentFileName);
305                           return;
306                    }
307                    r_output(buffer, i, end, j);
308               }
309               for(k=0; k<=D; k++)  B[k] = Init[0];
310               r1 = Init1 & B[0];
311               A[0] = (( Next[B[0]>>hh] | Next1[B[0]&LL]) & CMask) | r1;
312               for(k=1; k<=D; k++) {
313                    r3 = B[k];
314                    r1 = Init1 & r3;
315                    r2 = A[k-1] | B[k-1];
316                    A[k] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) |                                       ((B[k-1] | Next[r2>>hh] | Next1[r2&LL]) & r_NO_ERR) | r1;
317               }
318         }
319 Nextchar: i=i+1;
320         c = buffer[i];
321         CMask = Mask[c];
322         if(c != Newline)
323         {
324            if(CMask != 0) {
325               r2 = A[0];
326               r1 = Init1 & r2;
327               B[0] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | r1;
328               r3 = A[1];
329               r1 = Init1 & r3;
330               r0 = B[0] | r2;
331               B[1] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) |                                       ((r2 | Next[r0>>hh] | Next1[r0&LL]) & r_NO_ERR) | r1 ;
332                      if(D == 1) goto Nextchar1;
333               r2 = A[2];
334               r1 = Init1 & r2;
335               r0 = B[1] | r3;
336               B[2] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) |                                   ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
337                      if(D == 2) goto Nextchar1;
338               r3 = A[3];
339               r1 = Init1 & r3;
340               r0 = B[2] | r2;
341               B[3] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) |                                       ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
342                      if(D == 3) goto Nextchar1;
343               r2 = A[4];
344               r1 = Init1 & r2;
345               r0 = B[3] | r3;
346               B[4] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) |                                       ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
347                      if(D == 4)   goto Nextchar1;
348            }  /* if(CMask) */
349 	   else  {
350               r2 = A[0];
351               B[0] = r2 & Init1;
352               r3 = A[1];
353               r1 = Init1 & r3;
354               r0 = B[0] | r2;
355               B[1] = ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
356                    if(D == 1) goto Nextchar1;
357               r2 = A[2];
358               r1 = Init1 & r2;
359               r0 = B[1] | r3;
360               B[2] = ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
361                    if(D == 2) goto Nextchar1;
362               r3 = A[3];
363               r1 = Init1 & r3;
364               r0 = B[2] | r2;
365               B[3] = ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
366                    if(D == 3) goto Nextchar1;
367               r2 = A[4];
368               r1 = Init1 & r2;
369               r0 = B[3] | r3;
370               B[4] = ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
371                    if(D == 4) goto Nextchar1;
372            }
373         } /* if(NOT Newline) */
374         else {  j++;
375               r1 = Init1 & A[D];            /* match against endofline */
376               B[D] = ((Next[A[D]>>hh] | Next1[A[D]&LL]) & CMask) | r1;
377               if(TAIL) B[D] = ( Next[B[D]>>hh] | Next1[B[D]&LL] ) | B[D];
378                                            /* epsilon move */
379               if(( B[D] & 1 ) ^ INVERSE) {
380                    if(FILENAMEONLY) {
381                           num_of_matched++;
382                           printf("%s\n", CurrentFileName);
383                           return;
384                    }
385                    r_output(buffer, i, end, j);
386               }
387               for(k=0; k<=D; k++) A[k] = Init0;
388               r1 = Init1 & A[0];
389               B[0] = ((Next[A[0]>>hh] | Next1[A[0]&LL]) & CMask) | r1;
390               for(k=1; k<=D; k++) {
391                    r3 = A[k];
392                    r1 = Init1 & r3;
393                    r2 = A[k-1] | B[k-1];
394                    B[k] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) |                                       ((A[k-1] | Next[r2>>hh] | Next1[r2&LL]) & r_NO_ERR) | r1;
395               }
396         }
397 Nextchar1: i=i+1;
398     } /* while */
399     strncpy(buffer, buffer+num_read, Maxline);
400   } /* while */
401   return;
402 } /* re1 */
403 
re(Text,M,D)404 void re(Text, M, D)
405 int Text, M, D;
406 {
407   register unsigned i, c, r1, r2, r3, CMask, k, Newline, Init0, Init1, end;
408   register unsigned r_even, r_odd, r_NO_ERR ;
409   unsigned RMask[MAXSYM];
410   unsigned A[MaxRerror+1], B[MaxRerror+1];
411   int num_read, j=0, bp, lasti, base, ResidueSize;
412   int FIRST_TIME; /* Flag */
413   base = WORD - M;
414   k = 2*exponen(M);
415   if(FIRST_IN_RE) {
416          compute_next(M, Next, Next1);
417          FIRST_IN_RE = 0;    }
418   for(i=0; i< MAXSYM; i++) RMask[i] = Mask[i];
419   r_NO_ERR = NO_ERR_MASK;
420   Newline = '\n';
421   lasti = Maxline;
422   Init0 = Init[0] = Bit[base];
423   if(HEAD) Init0  = Init[0] = Init0 | Bit[base+1] ;
424   for(i=1; i<= D; i++) Init[i] = Init[i-1] | Next[Init[i-1]]; /* can be out? */
425   Init1 = Init0 | 1;
426   r2 = r3 = Init0;
427   for(k=0; k<= D; k++) { A[k] = B[k] = Init[0]; }  /* can be out? */
428   FIRST_TIME = ON;
429   if ( D == 0 )
430   {
431     while ((num_read = read(Text, buffer + Maxline, BlockSize)) > 0)
432     {
433       i=Maxline; end = Maxline + num_read ;
434       if((num_read < BlockSize)&&buffer[end-1] != '\n') buffer[end] = '\n';
435       if(FIRST_TIME) {
436          buffer[i-1] = '\n';
437 	 i--;
438          FIRST_TIME = 0;
439       }
440       while (i < end)
441       {
442         c = buffer[i++];
443         CMask = RMask[c];
444         if(c != Newline)
445         {
446               r1 = Init1 & r3;
447               r2 = (Next[r3] & CMask) | r1;
448         }
449         else {
450               r1 = Init1 & r3;            /* match against '\n' */
451               r2 = Next[r3] & CMask | r1;
452               j++;
453               if(TAIL) r2 = Next[r2] | r2 ;   /* epsilon move */
454               if(( r2 & 1) ^ INVERSE) {
455                    if(FILENAMEONLY) {
456                           num_of_matched++;
457                           printf("%s\n", CurrentFileName);
458                           return;
459                    }
460                    r_output(buffer, i-1, end, j);
461               }
462               lasti = i - 1;
463               r3 = Init0;
464               r2 = (Next[r3] & CMask) | Init0;
465         }
466         c = buffer[i++];
467         CMask = RMask[c];
468         if(c != Newline)
469         {
470               r1 = Init1 & r2;
471               r3 = (Next[r2] & CMask) | r1;
472         }
473         else {  j++;
474               r1 = Init1 & r2;            /* match against endofline */
475               r3 = Next[r2] & CMask | r1;
476               if(TAIL) r3 = Next[r3] | r3;
477               if(( r3 & 1) ^ INVERSE) {
478                    if(FILENAMEONLY) {
479                           num_of_matched++;
480                           printf("%s\n", CurrentFileName);
481                           return;
482                    }
483                    r_output(buffer, i-1, end, j);
484               }
485               lasti = i - 1;
486               r2 = Init0;
487               r3 = (Next[r2] & CMask) | Init0;  /* match the newline */
488         }
489       } /* while */
490       ResidueSize = Maxline + num_read - lasti;
491       if(ResidueSize > Maxline) {
492            ResidueSize = Maxline;  }
493       strncpy(buffer+Maxline-ResidueSize, buffer+lasti, ResidueSize);
494       lasti = Maxline - ResidueSize;
495     } /* while */
496   return;
497   } /* end if(D==0) */
498   while ((num_read = read(Text, buffer + Maxline, BlockSize)) > 0)
499   {
500     i=Maxline; end = Maxline+num_read;
501     if((num_read < BlockSize) && buffer[end-1] != '\n') buffer[end] = '\n';
502     if(FIRST_TIME) {
503         buffer[i-1] = '\n';
504 	i--;
505         FIRST_TIME = 0;
506     }
507     while (i < end)
508     {   c = buffer[i++];
509         CMask = RMask[c];
510         if (c != Newline)
511         {
512               r_even = B[0];
513               r1 = Init1 & r_even;
514               A[0] = (Next[r_even] & CMask) | r1;
515               r_odd = B[1];
516               r1 = Init1 & r_odd;
517               r2 = (r_even | Next[r_even|A[0]]) &r_NO_ERR;
518               A[1] = (Next[r_odd] & CMask) | r2 | r1 ;
519                      if(D == 1) goto Nextchar;
520               r_even = B[2];
521               r1 = Init1 & r_even;
522               r2 = (r_odd | Next[r_odd|A[1]]) &r_NO_ERR;
523               A[2] = (Next[r_even] & CMask) | r2 | r1 ;
524                      if(D == 2) goto Nextchar;
525               r_odd = B[3];
526               r1 = Init1 & r_odd;
527               r2 = (r_even | Next[r_even|A[2]]) &r_NO_ERR;
528               A[3] = (Next[r_odd] & CMask) | r2 | r1 ;
529                      if(D == 3) goto Nextchar;
530               r_even = B[4];
531               r1 = Init1 & r_even;
532               r2 = (r_odd | Next[r_odd|A[3]]) &r_NO_ERR;
533               A[4] = (Next[r_even] & CMask) | r2 | r1 ;
534               goto Nextchar;
535         } /* if NOT Newline */
536         else {  j++;
537               r1 = Init1 & B[D];               /* match endofline */
538               A[D] = (Next[B[D]] & CMask) | r1;
539               if(TAIL) A[D] = Next[A[D]] | A[D];
540               if((A[D] & 1) ^ INVERSE )  {
541                    if(FILENAMEONLY) {
542                           num_of_matched++;
543                           printf("%s\n", CurrentFileName);
544                           return;
545                    }
546                    r_output(buffer, i-1, end, j);
547               }
548               for(k=0; k<= D; k++) { A[k] = B[k] = Init[k]; }
549               r1 = Init1 & B[0];
550               A[0] = (Next[B[0]] & CMask) | r1;
551               for(k=1; k<= D; k++) {
552                     r1 = Init1 & B[k];
553                     r2 = (B[k-1] | Next[A[k-1]|B[k-1]]) &r_NO_ERR;
554                     A[k] = (Next[B[k]] & CMask) | r1 | r2;
555               }
556         }
557 Nextchar:
558         c = buffer[i];
559         CMask = RMask[c];
560         if(c != Newline)
561         {
562               r1 = Init1 & A[0];
563               B[0] = (Next[A[0]] & CMask) | r1;
564               r1 = Init1 & A[1];
565               B[1] = (Next[A[1]] & CMask) |                                                          ((A[0] | Next[A[0] | B[0]]) & r_NO_ERR) | r1 ;
566                      if(D == 1) goto Nextchar1;
567               r1 = Init1 & A[2];
568               B[2] = (Next[A[2]] & CMask) | ((A[1] | Next[A[1] | B[1]]) &r_NO_ERR) | r1 ;
569                      if(D == 2) goto Nextchar1;
570               r1 = Init1 & A[3];
571               B[3] = (Next[A[3]] & CMask) | ((A[2] | Next[A[2] | B[2]])&r_NO_ERR) | r1 ;
572                      if(D == 3) goto Nextchar1;
573               r1 = Init1 & A[4];
574               B[4] = (Next[A[4]] & CMask) | ((A[3] | Next[A[3] | B[3]])&r_NO_ERR) | r1 ;
575               goto Nextchar1;
576         } /* if(NOT Newline) */
577         else {  j++;
578               r1 = Init1 & A[D];               /* match endofline */
579               B[D] = (Next[A[D]] & CMask) | r1;
580               if(TAIL) B[D] = Next[B[D]] | B[D];
581               if((B[D] & 1) ^ INVERSE )  {
582                    if(FILENAMEONLY) {
583                           num_of_matched++;
584                           printf("%s\n", CurrentFileName);
585                           return;
586                    }
587                    r_output(buffer, i, end, j);
588               }
589               for(k=0; k<= D; k++) { A[k] = B[k] = Init[k]; }
590               r1 = Init1 & A[0];
591               B[0] = (Next[A[0]] & CMask) | r1;
592               for(k=1; k<= D; k++) {
593                     r1 = Init1 & A[k];
594                     r2 = (A[k-1] | Next[A[k-1]|B[k-1]])&r_NO_ERR;
595                     B[k] = (Next[A[k]] & CMask) | r1 | r2;
596               }
597         }
598 Nextchar1: i++;
599     } /* while i < end */
600     strncpy(buffer, buffer+num_read, Maxline);
601   } /* while  read() */
602   return;
603 } /* re */
604 
605 
r_output(buffer,i,end,j)606 void r_output (buffer, i, end, j)
607 int i, end, j;
608 CHAR *buffer;
609 {
610 int bp;
611       if(i >= end) return;
612       num_of_matched++;
613       if(COUNT)  return;
614       if(FNAME) printf("%s: ", CurrentFileName);
615       bp = i-1;
616       while ((buffer[bp] != '\n') && (bp > 0)) bp--;
617       if(LINENUM) printf("%d: ", j);
618       if(buffer[bp] != '\n') bp = Maxline-1;
619       bp++;
620       while (bp <= i ) putchar(buffer[bp++]);
621 }
622 
main(argc,argv)623 int main(argc, argv)
624 int argc; char *argv[];
625 {
626   int N, M, D=0, fp, fd, i, j;
627   char c;
628   int filetype;
629   unsigned char Pattern[MAXPAT], OldPattern[MAXPAT], temp[MAXPAT];
630 
631   initial_value();
632   strcpy(Progname, argv[0]);
633   if (argc < 2) usage();
634   Pattern[0] = '\0';
635   while(--argc > 0 && (*++argv)[0] == '-') {
636      c = *(argv[0]+1);
637      switch(c) {
638        case 'c' : COUNT = ON;    /* output the # of matched */
639                   break;
640        case 's' : SILENT = ON;   /* silent mode  */
641                   break;
642        case 'p' : I = 0;         /* insertion cost is 0 */
643                   break;
644        case 'x' : WHOLELINE = ON;  /* match the whole line */
645 		  if(WORDBOUND) {
646 			fprintf(stderr, "%s: illegal option combination\n", Progname);
647 			exit(2);
648 		  }
649                   break;
650        case 'L' : break;
651        case 'd' : DELIMITER = ON;  /* user defines delimiter */
652                   if(argc <= 1) usage();
653                   if (argv[0][2] == '\0') {/* space after -d option */
654                     argv++;
655                     if ((D_length = strlen(argv[0])) > MaxDelimit) {
656                       fprintf(stderr, "%s: delimiter pattern too long\n", Progname);
657                       exit(2);
658                     }
659                     D_pattern[0] = '<';
660                     strcpy(D_pattern+1, argv[0]);
661                     argc--;
662                   } else {
663                     if ((D_length = strlen(argv[0]+2)) > MaxDelimit) {
664                       fprintf(stderr, "%s: delimiter pattern too long\n", Progname);
665                       exit(2);
666                     }
667                     D_pattern[0] = '<';
668                     strcpy(D_pattern+1, argv[0]+2);
669                   } /* else */
670                   strcat(D_pattern, ">; ");
671                   D_length++;   /* to count ';' as one */
672                   break;
673        case 'e' : argc--;
674 		  if(argc == 0) {
675 			fprintf(stderr, "%s: the pattern should immediately follow the -e option\n", Progname);
676 			usage();
677 		  }
678 		  if((++argv)[0][0] == '-') {
679                        Pattern[0] = '\\';
680                        strcat(Pattern, (argv)[0]);
681                   }
682                   else strcat(Pattern, argv[0]);
683                   break;
684        case 'f' : PAT_FILE = ON;
685 		  argv++;
686 		  argc--;
687 		  if((fp = open(argv[0], 0)) < 0) {
688 			fprintf(stderr, "%s: Can't open pattern file %s\n", Progname, argv[0]);
689 			exit(2);
690 		  }
691 		  break;
692        case 'h' : NOFILENAME = ON;
693                   break;
694        case 'i' : NOUPPER = ON;
695                   break;
696        case 'k' : argc--;
697 		  if(argc == 0) {
698 			fprintf(stderr, "%s: the pattern should immediately follow the -k option\n", Progname);
699 			usage();
700 		  }
701 		  CONSTANT = ON;
702 		  argv++;
703 		  strcat(Pattern, argv[0]);
704 		  if(argc > 1 && argv[1][0] == '-') {
705 			fprintf(stderr, "%s: -k should be the last option in the command\n", Progname);
706 			exit(2);
707 		  }
708 		  break;
709        case 'l' : FILENAMEONLY = ON;
710                   break;
711        case 'n' : LINENUM = ON;  /* output prefixed by line no*/
712                   break;
713        case 'v' : INVERSE = ON;  /* output no-matched lines */
714                   break;
715        case 't' : OUTTAIL = ON;  /* output from tail of delimiter */
716                   break;
717        case 'B' : BESTMATCH = ON;
718                   break;
719        case 'w' : WORDBOUND = ON;/* match to words */
720 		  if(WHOLELINE) {
721 			fprintf(stderr, "%s: illegal option combination\n", Progname);
722 			exit(2);
723 		  }
724                   break;
725        case 'y' : NOPROMPT = ON;
726 		  break;
727        case 'I' : I = atoi(argv[0]+2);  /* Insertion Cost */
728 	          JUMP = ON;
729                   break;
730        case 'S' : S = atoi(argv[0]+2);  /* Substitution Cost */
731                   JUMP = ON;
732                   break;
733        case 'D' : DD = atoi(argv[0]+2); /* Deletion Cost */
734                   JUMP = ON;
735                   break;
736        case 'G' : FILEOUT = ON;
737 		  COUNT = ON;
738 		  break;
739        default  : if (isdigit(c)) {
740 		    APPROX = ON;
741                     D = atoi(argv[0]+1);
742                     if (D > MaxError) {
743                       fprintf(stderr,"%s: the maximum number of errors is %d \n", Progname, MaxError);
744                       exit(2);
745                     }
746                   } else {
747                     fprintf(stderr, "%s: illegal option  -%c\n",Progname, c);
748 		    usage();
749                   }
750        } /* switch(c) */
751   } /* while (--argc > 0 && (*++argv)[0] == '-') */
752 
753   if (FILENAMEONLY && NOFILENAME) {
754     fprintf(stderr, "%s: -h and -l options are mutually exclusive\n",Progname);
755   }
756   if (COUNT && (FILENAMEONLY || NOFILENAME)) {
757       FILENAMEONLY = OFF;
758       if(!FILEOUT) NOFILENAME = OFF;
759   }
760   if (!(PAT_FILE) && Pattern[0] == '\0') { /* Pattern not set with -e option */
761     if (argc == 0) usage();
762     strncpy(Pattern, *argv, sizeof(Pattern));
763     argc--;
764     argv++;
765   }
766   Numfiles = 0;
767   fd = 3; /* make sure it's not 0 */
768   if (argc == 0)  /* check Pattern against stdin */
769     fd = 0;
770   else {
771     if (!(Textfiles = (CHAR **)malloc(argc * sizeof(CHAR *) ))) {
772       fprintf(stderr, "%s: malloc failure (you probably don't have enough memory)\n", Progname);
773       exit(2);
774     }
775     while (argc--)  { /* one or more filenames on command line -- put
776                           the valid filenames in a array of strings */
777 /*
778       if ((filetype = check_file(*argv)) != ISASCIIFILE) {
779 	if(filetype == NOSUCHFILE) fprintf(stderr,"%s: %s: no such file or directory\n",Progname,*argv);
780         argv++;
781 */
782 
783       if ((filetype = check_file(*argv)) == NOSUCHFILE) {
784 	if(filetype == NOSUCHFILE) fprintf(stderr,"%s: %s: no such file or directory\n",Progname,*argv);
785         argv++;
786       } else { /* file is ascii*/
787         if (!(Textfiles[Numfiles] = (CHAR *)malloc((strlen(*argv)+1)))) {
788           fprintf(stderr, "%s: malloc failure (you probably don't have enough memory)\n", Progname);
789           exit(2);
790         }
791         strcpy(Textfiles[Numfiles++], *argv++);
792 	   } /* else */
793      } /* while (argc--) */
794   } /* else */
795   checksg(Pattern, D);       /* check if the pattern is simple */
796   strcpy(OldPattern, Pattern);
797   if (SGREP == 0) {
798       preprocess(D_pattern, Pattern);
799       strcpy(old_D_pat, D_pattern);
800       M = maskgen(Pattern, D);
801   }
802   else M = strlen(OldPattern);
803   if (PAT_FILE)  prepf(fp);
804   if (Numfiles > 1) FNAME = ON;
805   if (NOFILENAME) FNAME = 0;
806   num_of_matched = 0;
807   compat(); /* check compatibility between options */
808   if (fd == 0) {
809     if(FILENAMEONLY) {
810        fprintf(stderr, "%s: -l option is not compatible with standard input\n", Progname);
811        exit(2);
812     }
813     if(PAT_FILE) mgrep(fd);
814     else {
815     	if(SGREP) sgrep(OldPattern, strlen(OldPattern), fd, D);
816    	else      bitap(old_D_pat, Pattern, fd, M, D);
817     }
818     if (COUNT) {
819 	if(INVERSE && PAT_FILE) printf("%d\n", total_line-num_of_matched);
820 	else printf("%d\n", num_of_matched);
821     }
822   }
823   else {
824     for (i = 0; i < Numfiles; i++, close(fd), num_of_matched = 0) {
825       	strcpy(CurrentFileName, Textfiles[i]);
826       	if ((fd = open(Textfiles[i], 0)) <= 0) {
827             fprintf(stderr, "%s: can't open file %s\n",Progname, Textfiles[i]);
828       	}
829 	else {
830 	     	if(PAT_FILE) mgrep(fd);
831 	     	else {
832              		if(SGREP) sgrep(OldPattern, strlen(OldPattern), fd, D);
833              		else      bitap(old_D_pat, Pattern, fd, M, D);
834 	        }
835         	if (num_of_matched) NOMATCH = OFF;
836         	if (COUNT && !FILEOUT) {
837 			if(INVERSE && PAT_FILE) {
838  		    		if(FNAME) printf("%s: %d\n", CurrentFileName, total_line - num_of_matched);
839 				else printf("%d\n", total_line - num_of_matched);
840 			}
841 			else {
842  		    		if(FNAME) printf("%s: %d\n", CurrentFileName, num_of_matched);
843 				else printf("%d\n", num_of_matched);
844 			}
845         	}  /* if COUNT */
846 		if(FILEOUT && num_of_matched) {
847 			file_out(CurrentFileName);
848 		}
849       	} /* else */
850     }  /* for i < Numfiles */
851     if(NOMATCH && BESTMATCH) {
852 	if(WORDBOUND || WHOLELINE || LINENUM || INVERSE) {
853 		SGREP = 0;
854       		preprocess(D_pattern, Pattern);
855       		strcpy(old_D_pat, D_pattern);
856       		M = maskgen(Pattern, D);
857 	}
858 	COUNT=ON; D=1;
859 	while(D<M && D<=MaxError && num_of_matched == 0) {
860     		for (i = 0; i < Numfiles; i++, close(fd)) {
861       			strcpy(CurrentFileName, Textfiles[i]);
862       			if ((fd = open(Textfiles[i], 0)) > 0) {
863 	     			if(PAT_FILE) mgrep(fd);
864 	     			else {
865              		    		if(SGREP) sgrep(OldPattern,strlen(OldPattern),fd,D);
866              		    		else bitap(old_D_pat,Pattern,fd,M,D);
867       				}
868 			}
869    		}  /* for i < Numfiles */
870 		D++;
871 	} /* while */
872 	if(num_of_matched > 0) {
873 		D--; COUNT = 0;
874 		if(NOPROMPT) goto GO_AHEAD;
875 		if(D==1) fprintf(stderr, "best match has 1 error, ");
876 		else fprintf(stderr, "best match has %d errors, ", D);
877 		fflush(stderr);
878 		if(num_of_matched == 1) fprintf(stderr,"there is 1 match, output it? (y/n)");
879 		else fprintf(stderr,"there are %d matches, output them? (y/n)", num_of_matched);
880 		scanf("%c",&c);
881 		if(c != 'y') goto CONT;
882 GO_AHEAD:
883     		for (i = 0; i < Numfiles; i++, close(fd)) {
884       			strcpy(CurrentFileName, Textfiles[i]);
885       			if ((fd = open(Textfiles[i], 0)) > 0) {
886 	     			if(PAT_FILE) mgrep(fd);
887 	     			else {
888              		    		if(SGREP) sgrep(OldPattern,strlen(OldPattern),fd,D);
889              		    		else bitap(old_D_pat,Pattern,fd,M,D);
890 				}
891       			}
892    		}  /* for i < Numfiles */
893 		NOMATCH = 0;
894 	}
895    }
896  }
897 CONT:
898  if(EATFIRST) {
899       printf("\n");
900       EATFIRST = OFF;
901  }
902  if(num_of_matched) NOMATCH = OFF;
903  if(NOMATCH) exit(1);
904  exit(0);
905 } /* end of main() */
906 
907 
file_out(fname)908 void file_out(fname)
909 char *fname;
910 {
911 int num_read;
912 int fd;
913 int i, len;
914 CHAR buf[4097];
915 	if(FNAME) {
916 		len = strlen(fname);
917 		putchar('\n');
918 		for(i=0; i< len; i++) putchar(':');
919 		putchar('\n');
920 		printf("%s\n", CurrentFileName);
921 		len = strlen(fname);
922 		for(i=0; i< len; i++) putchar(':');
923 		putchar('\n');
924 		fflush(stdout);
925 	}
926 	fd = open(fname, 0);
927 	while((num_read = read(fd, buf, 4096)) > 0)
928 		write(1, buf, num_read);
929 }
930 
931 
usage()932 void usage()
933 {
934     	fprintf(stderr, "usage: %s [-#cdehiklnpstvwxBDGIS] [-f patternfile] pattern [files]\n", Progname);
935  	printf("\n");
936 	fprintf(stderr, "summary of frequently used options:\n");
937 	fprintf(stderr, "-#: find matches with at most # errors\n");
938 	fprintf(stderr, "-c: output the number of matched records\n");
939 	fprintf(stderr, "-d: define record delimiter\n");
940 	fprintf(stderr, "-h: do not output file names\n");
941 	fprintf(stderr, "-i: case-insensitive search, e.g., 'a' = 'A'\n");
942 	fprintf(stderr, "-l: output the names of files that contain a match\n");
943 	fprintf(stderr, "-n: output record prefixed by record number\n");
944 	fprintf(stderr, "-v: output those records containing no matches\n");
945 	fprintf(stderr, "-w: pattern has to match as a word, e.g., 'win' will not match 'wind'\n");
946 	fprintf(stderr, "-B: best match mode. find the closest matches to the pattern\n");
947 	fprintf(stderr, "-G: output the files that contain a match\n");
948  	printf("\n");
949 
950     	exit(2);
951 }
952 
checksg(Pattern,D)953 void checksg(Pattern, D)
954 CHAR *Pattern; int D;
955 {
956   char c;
957   int i, m;
958   m = strlen(Pattern);
959   if(!(PAT_FILE) && m <= D) {
960       fprintf(stderr, "%s: size of pattern must be greater than number of errors\n", Progname);
961       exit(2);
962   }
963   SIMPLEPATTERN = ON;
964   for (i=0; i < m; i++)
965   {
966       switch(Pattern[i])
967       {
968          case ';' : SIMPLEPATTERN = OFF; break;
969          case ',' : SIMPLEPATTERN = OFF; break;
970          case '.' : SIMPLEPATTERN = OFF; break;
971          case '*' : SIMPLEPATTERN = OFF; break;
972          case '-' : SIMPLEPATTERN = OFF; break;
973          case '[' : SIMPLEPATTERN = OFF; break;
974          case ']' : SIMPLEPATTERN = OFF; break;
975          case '(' : SIMPLEPATTERN = OFF; break;
976          case ')' : SIMPLEPATTERN = OFF; break;
977          case '<' : SIMPLEPATTERN = OFF; break;
978          case '>' : SIMPLEPATTERN = OFF; break;
979          case '^' : if(D > 0) SIMPLEPATTERN = OFF;
980 		    break;
981          case '$' : if(D > 0) SIMPLEPATTERN = OFF;
982 		    break;
983          case '|' : SIMPLEPATTERN = OFF; break;
984          case '#' : SIMPLEPATTERN = OFF; break;
985          case '\\' : SIMPLEPATTERN = OFF; break;
986          default  : break;
987       }
988   }
989   if (CONSTANT) SIMPLEPATTERN = ON;
990   if (SIMPLEPATTERN == OFF) return;
991   if (NOUPPER && D) return;
992   if (JUMP == ON) return;
993   if (I == 0) return;
994   if (LINENUM) return;
995   if (DELIMITER) return;
996   if (INVERSE) return;
997   if (WORDBOUND && D > 0) return;
998   if (WHOLELINE && D > 0) return;
999   if (SILENT) return;     /* REMINDER: to be removed */
1000   SGREP = ON;
1001   if(m >= 16) DNA = ON;
1002   for(i=0; i<m; i++) {
1003 	c = Pattern[i];
1004 	if(c == 'a' || c == 'c' || c == 't' || c == 'g' ) ;
1005 	else DNA = OFF;
1006   }
1007   return;
1008 }
1009 
1010 void output (register CHAR *buffer, int i1, int i2, int j);
output(buffer,i1,i2,j)1011 void output (buffer, i1, i2, j)
1012 register CHAR *buffer; int i1, i2, j;
1013 {
1014 register CHAR *bp, *outend;
1015 	if(i1 > i2) return;
1016         num_of_matched++;
1017         if(COUNT)  return;
1018         if(SILENT) return;
1019 	if(OUTTAIL) {
1020               i1 = i1 + D_length;
1021               i2 = i2 + D_length;
1022         }
1023         if(DELIMITER) j = j+1;
1024         if(FIRSTOUTPUT) {
1025            if (buffer[i1] == '\n')  {
1026                i1++;
1027                EATFIRST = ON;
1028            }
1029            FIRSTOUTPUT = 0;
1030         }
1031         if(TRUNCATE) {
1032            fprintf(stderr, "WARNING!!!  some lines have been truncated in output record #%d\n", num_of_matched-1);
1033         }
1034         while(buffer[i1] == '\n' && i1 <= i2) {
1035 	   printf("\n");
1036            i1++;
1037         }
1038         if(FNAME == ON) printf("%s: ", CurrentFileName);
1039         if(LINENUM) printf("%d: ", j-1);
1040 	bp = buffer + i1;
1041 	outend = buffer + i2;
1042 	while(bp <= outend) putchar(*bp++);
1043 }
1044 
1045 /* end of main.c */
1046