1 /* Copyright (c) 1991 Sun Wu and Udi Manber. All Rights Reserved. */
2 #include "agrep.h"
3 #include "checkfile.h"
4 #include <unistd.h>
5 #include <fcntl.h>
6
7 int exponen(int m);
8 void r_output (CHAR *buffer, int i, int end, int j);
9 void file_out(char *fname);
10 void usage(void);
11 void checksg(CHAR *Pattern, int D);
12
13 unsigned Mask[MAXSYM];
14 unsigned Init1, NO_ERR_MASK, Init[MaxError];
15 unsigned Bit[WORD+1];
16 CHAR buffer[BlockSize+Maxline+1];
17 unsigned Next[MaxNext], Next1[MaxNext];
18 unsigned wildmask, endposition, D_endpos;
19 int REGEX, RE_ERR, FNAME, WHOLELINE, SIMPLEPATTERN;
20 int COUNT, HEAD, TAIL, LINENUM, INVERSE, I, S, DD, AND, SGREP, JUMP;
21 int Num_Pat, PSIZE, num_of_matched, SILENT, NOPROMPT, BESTMATCH, NOUPPER;
22 int NOMATCH, TRUNCATE, FIRST_IN_RE, FIRSTOUTPUT;
23 int WORDBOUND, DELIMITER, D_length;
24 int EATFIRST, OUTTAIL;
25 int FILEOUT;
26 int DNA = 0;
27 int APPROX = 0;
28 int PAT_FILE = 0;
29 int CONSTANT = 0;
30 int total_line = 0; /* used in mgrep */
31
32 CHAR **Textfiles; /* array of filenames to be searched */
33 CHAR old_D_pat[MaxDelimit] = "\n"; /* to hold original D_pattern */
34 CHAR CurrentFileName[MAXNAME];
35 CHAR Progname[MAXNAME];
36 CHAR D_pattern[MaxDelimit] = "\n; "; /* string which delimits records --
37 defaults to newline */
38 int NOFILENAME = 0, /* Boolean flag, set for -h option */
39 FILENAMEONLY = 0,/* Boolean flag, set for -l option */
40 Numfiles = 0; /* indicates how many files in Textfiles */
41 extern int init();
42 int table[WORD][WORD];
43
initial_value()44 void initial_value()
45 {
46 int i;
47
48 JUMP = REGEX = FNAME = BESTMATCH = NOPROMPT = NOUPPER = 0;
49 COUNT = LINENUM = WHOLELINE = SGREP = 0;
50 EATFIRST = INVERSE = AND = TRUNCATE = OUTTAIL = 0;
51 FIRST_IN_RE = NOMATCH = FIRSTOUTPUT = ON;
52 I = DD = S = 1;
53 HEAD = TAIL = ON;
54 D_length = 2;
55 SILENT = Num_Pat = PSIZE = SIMPLEPATTERN = num_of_matched = 0 ;
56 WORDBOUND = DELIMITER = RE_ERR = 0;
57 Bit[WORD] = 1;
58 for (i = WORD - 1; i > 0 ; i--) Bit[i] = Bit[i+1] << 1;
59 for (i=0; i< MAXSYM; i++) Mask[i] = 0;
60 }
61
compute_next(M,Next,Next1)62 void compute_next(M, Next, Next1)
63 int M; unsigned *Next, *Next1;
64 {
65 int i, j=0, n, k, temp;
66 int mid, pp;
67 int MM, base;
68 unsigned V[WORD];
69
70 base = WORD - M;
71 temp = Bit[base]; Bit[base] = 0;
72 for (i=0; i<WORD; i++) V[i] = 0;
73 for (i=1; i<M; i++)
74 {
75 j=0;
76 while (table[i][j] > 0 && j < 10) {
77 V[i] = V[i] | Bit[base + table[i][j++]];
78 }
79 }
80 Bit[base]=temp;
81 if(M <= SHORTREG)
82 {
83 k = exponen(M);
84 pp = 2*k;
85 for(i=k; i<pp ; i++)
86 { n = i;
87 Next[i]= (k>>1);
88 for(j=M; j>=1; j--)
89 {
90 if(n & Bit[WORD]) Next[i] = Next[i] | V[j];
91 n = (n>>1);
92 }
93 }
94 return;
95 }
96 if(M > MAXREG) fprintf(stderr, "%s: regular expression too long\n", Progname);
97 MM = M;
98 if(M & 1) M=M+1;
99 k = exponen(M/2);
100 pp = 2*k;
101 mid = MM/2;
102 for(i=k; i<pp ; i++)
103 { n = i;
104 Next[i]= (Bit[base]>>1);
105 for(j=MM; j>mid ; j--)
106 {
107 if(n & Bit[WORD]) Next[i] = Next[i] | V[j-mid];
108 n = (n>>1);
109 }
110 n=i-k;
111 Next1[i-k] = 0;
112 for(j = 0; j<mid; j++)
113 {
114 if(n & Bit[WORD]) Next1[i-k] = Next1[i-k] | V[MM-j];
115 n = (n>>1);
116 }
117 }
118 return;
119 }
120
exponen(m)121 int exponen(m)
122 int m;
123 { int i, ex;
124 ex= 1;
125 for (i=0; i<m; i++) ex= ex*2;
126 return(ex);
127 }
128
re1(Text,M,D)129 void re1(Text, M, D)
130 int Text, M, D;
131 {
132 register unsigned i, c, r0, r1, r2, r3, CMask, Newline, Init0, r_NO_ERR;
133 register unsigned end;
134 register unsigned hh, LL=0, k; /* Lower part */
135 int FIRST_TIME=ON, num_read , j=0, base;
136 unsigned A[MaxRerror+1], B[MaxRerror+1];
137 unsigned Next[MaxNext], Next1[MaxNext];
138 CHAR buffer[BlockSize+Maxline+1];
139 int FIRST_LOOP = 1;
140
141 r_NO_ERR = NO_ERR_MASK;
142 if(M > 30) {
143 fprintf(stderr, "%s: regular expression too long\n", Progname);
144 exit(2);
145 }
146 base = WORD - M;
147 hh = M/2;
148 for(i=WORD, j=0; j < hh ; i--, j++) LL = LL | Bit[i];
149 if(FIRST_IN_RE) compute_next(M, Next, Next1);
150 /*SUN: try: change to memory allocation */
151 FIRST_IN_RE = 0;
152 Newline = '\n';
153 Init[0] = Bit[base];
154 if(HEAD) Init[0] = Init[0] | Bit[base+1];
155 for(i=1; i<= D; i++) Init[i] = Init[i-1] | Next[Init[i-1]>>hh] | Next1[Init[i-1]&LL];
156 Init1 = Init[0] | 1;
157 Init0 = Init[0];
158 r2 = r3 = Init[0];
159 for(k=0; k<= D; k++) { A[k] = B[k] = Init[k]; }
160 if ( D == 0 )
161 {
162 while ((num_read = read(Text, buffer + Maxline, BlockSize)) > 0)
163 {
164 i=Maxline; end = num_read + Maxline;
165 if((num_read < BlockSize) && buffer[end-1] != '\n') buffer[end] = '\n';
166 if(FIRST_LOOP) { /* if first time in the loop add a newline */
167 buffer[i-1] = '\n'; /* in front the text. */
168 i--;
169 FIRST_LOOP = 0;
170 }
171 while ( i < end )
172 {
173 c = buffer[i++];
174 CMask = Mask[c];
175 if(c != Newline)
176 { if(CMask != 0) {
177 r1 = Init1 & r3;
178 r2 = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | r1;
179 }
180 else {
181 r2 = r3 & Init1;
182 }
183 }
184 else { j++;
185 r1 = Init1 & r3; /* match against endofline */
186 r2 = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | r1;
187 if(TAIL) r2 = (Next[r2>>hh] | Next1[r2&LL]) | r2; /* epsilon move */
188 if(( r2 & 1 ) ^ INVERSE) {
189 if(FILENAMEONLY) {
190 num_of_matched++;
191 printf("%s\n", CurrentFileName);
192 return;
193 }
194 r_output(buffer, i-1, end, j);
195 }
196 r3 = Init0;
197 r2 = (Next[r3>>hh] | Next1[r3&LL]) & CMask | Init0;
198 /* match begin of line */
199 }
200 c = buffer[i++];
201 CMask = Mask[c];
202 if(c != Newline)
203 { if(CMask != 0) {
204 r1 = Init1 & r2;
205 r3 = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | r1;
206 }
207 else r3 = r2 & Init1;
208 } /* if(NOT Newline) */
209 else { j++;
210 r1 = Init1 & r2; /* match against endofline */
211 r3 = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | r1;
212 if(TAIL) r3 = ( Next[r3>>hh] | Next1[r3&LL] ) | r3;
213 /* epsilon move */
214 if(( r3 & 1 ) ^ INVERSE) {
215 if(FILENAMEONLY) {
216 num_of_matched++;
217 printf("%s\n", CurrentFileName);
218 return;
219 }
220 r_output(buffer, i-1, end, j);
221 }
222 r2 = Init0;
223 r3 = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | Init0;
224 /* match begin of line */
225 }
226 } /* while i < end ... */
227 strncpy(buffer, buffer+num_read, Maxline);
228 } /* end while read()... */
229 return;
230 } /* end if (D == 0) */
231 while ((num_read = read(Text, buffer + Maxline, BlockSize)) > 0)
232 {
233 i=Maxline; end = Maxline + num_read;
234 if((num_read < BlockSize) && buffer[end-1] != '\n') buffer[end] = '\n';
235 if(FIRST_TIME) { /* if first time in the loop add a newline */
236 buffer[i-1] = '\n'; /* in front the text. */
237 i--;
238 FIRST_TIME = 0;
239 }
240 while (i < end )
241 {
242 c = buffer[i];
243 CMask = Mask[c];
244 if(c != Newline)
245 {
246 if(CMask != 0) {
247 r2 = B[0];
248 r1 = Init1 & r2;
249 A[0] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | r1;
250 r3 = B[1];
251 r1 = Init1 & r3;
252 r0 = r2 | A[0]; /* A[0] | B[0] */
253 A[1] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | (( r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
254 if(D == 1) goto Nextchar;
255 r2 = B[2];
256 r1 = Init1 & r2;
257 r0 = r3 | A[1];
258 A[2] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
259 if(D == 2) goto Nextchar;
260 r3 = B[3];
261 r1 = Init1 & r3;
262 r0 = r2 | A[2];
263 A[3] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
264 if(D == 3) goto Nextchar;
265 r2 = B[4];
266 r1 = Init1 & r2;
267 r0 = r3 | A[3];
268 A[4] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
269 if(D == 4) goto Nextchar;
270 } /* if(CMask) */
271 else {
272 r2 = B[0];
273 A[0] = r2 & Init1;
274 r3 = B[1];
275 r1 = Init1 & r3;
276 r0 = r2 | A[0];
277 A[1] = ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
278 if(D == 1) goto Nextchar;
279 r2 = B[2];
280 r1 = Init1 & r2;
281 r0 = r3 | A[1];
282 A[2] = ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
283 if(D == 2) goto Nextchar;
284 r3 = B[3];
285 r1 = Init1 & r3;
286 r0 = r2 | A[2];
287 A[3] = ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
288 if(D == 3) goto Nextchar;
289 r2 = B[4];
290 r1 = Init1 & r2;
291 r0 = r3 | A[3];
292 A[4] = ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
293 if(D == 4) goto Nextchar;
294 }
295 }
296 else { j++;
297 r1 = Init1 & B[D]; /* match against endofline */
298 A[D] = ((Next[B[D]>>hh] | Next1[B[D]&LL]) & CMask) | r1;
299 if(TAIL) A[D] = ( Next[A[D]>>hh] | Next1[A[D]&LL] ) | A[D];
300 /* epsilon move */
301 if(( A[D] & 1 ) ^ INVERSE) {
302 if(FILENAMEONLY) {
303 num_of_matched++;
304 printf("%s\n", CurrentFileName);
305 return;
306 }
307 r_output(buffer, i, end, j);
308 }
309 for(k=0; k<=D; k++) B[k] = Init[0];
310 r1 = Init1 & B[0];
311 A[0] = (( Next[B[0]>>hh] | Next1[B[0]&LL]) & CMask) | r1;
312 for(k=1; k<=D; k++) {
313 r3 = B[k];
314 r1 = Init1 & r3;
315 r2 = A[k-1] | B[k-1];
316 A[k] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | ((B[k-1] | Next[r2>>hh] | Next1[r2&LL]) & r_NO_ERR) | r1;
317 }
318 }
319 Nextchar: i=i+1;
320 c = buffer[i];
321 CMask = Mask[c];
322 if(c != Newline)
323 {
324 if(CMask != 0) {
325 r2 = A[0];
326 r1 = Init1 & r2;
327 B[0] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | r1;
328 r3 = A[1];
329 r1 = Init1 & r3;
330 r0 = B[0] | r2;
331 B[1] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | ((r2 | Next[r0>>hh] | Next1[r0&LL]) & r_NO_ERR) | r1 ;
332 if(D == 1) goto Nextchar1;
333 r2 = A[2];
334 r1 = Init1 & r2;
335 r0 = B[1] | r3;
336 B[2] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
337 if(D == 2) goto Nextchar1;
338 r3 = A[3];
339 r1 = Init1 & r3;
340 r0 = B[2] | r2;
341 B[3] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
342 if(D == 3) goto Nextchar1;
343 r2 = A[4];
344 r1 = Init1 & r2;
345 r0 = B[3] | r3;
346 B[4] = ((Next[r2>>hh] | Next1[r2&LL]) & CMask) | ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
347 if(D == 4) goto Nextchar1;
348 } /* if(CMask) */
349 else {
350 r2 = A[0];
351 B[0] = r2 & Init1;
352 r3 = A[1];
353 r1 = Init1 & r3;
354 r0 = B[0] | r2;
355 B[1] = ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
356 if(D == 1) goto Nextchar1;
357 r2 = A[2];
358 r1 = Init1 & r2;
359 r0 = B[1] | r3;
360 B[2] = ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
361 if(D == 2) goto Nextchar1;
362 r3 = A[3];
363 r1 = Init1 & r3;
364 r0 = B[2] | r2;
365 B[3] = ((r2 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
366 if(D == 3) goto Nextchar1;
367 r2 = A[4];
368 r1 = Init1 & r2;
369 r0 = B[3] | r3;
370 B[4] = ((r3 | Next[r0>>hh] | Next1[r0&LL])&r_NO_ERR) | r1 ;
371 if(D == 4) goto Nextchar1;
372 }
373 } /* if(NOT Newline) */
374 else { j++;
375 r1 = Init1 & A[D]; /* match against endofline */
376 B[D] = ((Next[A[D]>>hh] | Next1[A[D]&LL]) & CMask) | r1;
377 if(TAIL) B[D] = ( Next[B[D]>>hh] | Next1[B[D]&LL] ) | B[D];
378 /* epsilon move */
379 if(( B[D] & 1 ) ^ INVERSE) {
380 if(FILENAMEONLY) {
381 num_of_matched++;
382 printf("%s\n", CurrentFileName);
383 return;
384 }
385 r_output(buffer, i, end, j);
386 }
387 for(k=0; k<=D; k++) A[k] = Init0;
388 r1 = Init1 & A[0];
389 B[0] = ((Next[A[0]>>hh] | Next1[A[0]&LL]) & CMask) | r1;
390 for(k=1; k<=D; k++) {
391 r3 = A[k];
392 r1 = Init1 & r3;
393 r2 = A[k-1] | B[k-1];
394 B[k] = ((Next[r3>>hh] | Next1[r3&LL]) & CMask) | ((A[k-1] | Next[r2>>hh] | Next1[r2&LL]) & r_NO_ERR) | r1;
395 }
396 }
397 Nextchar1: i=i+1;
398 } /* while */
399 strncpy(buffer, buffer+num_read, Maxline);
400 } /* while */
401 return;
402 } /* re1 */
403
re(Text,M,D)404 void re(Text, M, D)
405 int Text, M, D;
406 {
407 register unsigned i, c, r1, r2, r3, CMask, k, Newline, Init0, Init1, end;
408 register unsigned r_even, r_odd, r_NO_ERR ;
409 unsigned RMask[MAXSYM];
410 unsigned A[MaxRerror+1], B[MaxRerror+1];
411 int num_read, j=0, bp, lasti, base, ResidueSize;
412 int FIRST_TIME; /* Flag */
413 base = WORD - M;
414 k = 2*exponen(M);
415 if(FIRST_IN_RE) {
416 compute_next(M, Next, Next1);
417 FIRST_IN_RE = 0; }
418 for(i=0; i< MAXSYM; i++) RMask[i] = Mask[i];
419 r_NO_ERR = NO_ERR_MASK;
420 Newline = '\n';
421 lasti = Maxline;
422 Init0 = Init[0] = Bit[base];
423 if(HEAD) Init0 = Init[0] = Init0 | Bit[base+1] ;
424 for(i=1; i<= D; i++) Init[i] = Init[i-1] | Next[Init[i-1]]; /* can be out? */
425 Init1 = Init0 | 1;
426 r2 = r3 = Init0;
427 for(k=0; k<= D; k++) { A[k] = B[k] = Init[0]; } /* can be out? */
428 FIRST_TIME = ON;
429 if ( D == 0 )
430 {
431 while ((num_read = read(Text, buffer + Maxline, BlockSize)) > 0)
432 {
433 i=Maxline; end = Maxline + num_read ;
434 if((num_read < BlockSize)&&buffer[end-1] != '\n') buffer[end] = '\n';
435 if(FIRST_TIME) {
436 buffer[i-1] = '\n';
437 i--;
438 FIRST_TIME = 0;
439 }
440 while (i < end)
441 {
442 c = buffer[i++];
443 CMask = RMask[c];
444 if(c != Newline)
445 {
446 r1 = Init1 & r3;
447 r2 = (Next[r3] & CMask) | r1;
448 }
449 else {
450 r1 = Init1 & r3; /* match against '\n' */
451 r2 = Next[r3] & CMask | r1;
452 j++;
453 if(TAIL) r2 = Next[r2] | r2 ; /* epsilon move */
454 if(( r2 & 1) ^ INVERSE) {
455 if(FILENAMEONLY) {
456 num_of_matched++;
457 printf("%s\n", CurrentFileName);
458 return;
459 }
460 r_output(buffer, i-1, end, j);
461 }
462 lasti = i - 1;
463 r3 = Init0;
464 r2 = (Next[r3] & CMask) | Init0;
465 }
466 c = buffer[i++];
467 CMask = RMask[c];
468 if(c != Newline)
469 {
470 r1 = Init1 & r2;
471 r3 = (Next[r2] & CMask) | r1;
472 }
473 else { j++;
474 r1 = Init1 & r2; /* match against endofline */
475 r3 = Next[r2] & CMask | r1;
476 if(TAIL) r3 = Next[r3] | r3;
477 if(( r3 & 1) ^ INVERSE) {
478 if(FILENAMEONLY) {
479 num_of_matched++;
480 printf("%s\n", CurrentFileName);
481 return;
482 }
483 r_output(buffer, i-1, end, j);
484 }
485 lasti = i - 1;
486 r2 = Init0;
487 r3 = (Next[r2] & CMask) | Init0; /* match the newline */
488 }
489 } /* while */
490 ResidueSize = Maxline + num_read - lasti;
491 if(ResidueSize > Maxline) {
492 ResidueSize = Maxline; }
493 strncpy(buffer+Maxline-ResidueSize, buffer+lasti, ResidueSize);
494 lasti = Maxline - ResidueSize;
495 } /* while */
496 return;
497 } /* end if(D==0) */
498 while ((num_read = read(Text, buffer + Maxline, BlockSize)) > 0)
499 {
500 i=Maxline; end = Maxline+num_read;
501 if((num_read < BlockSize) && buffer[end-1] != '\n') buffer[end] = '\n';
502 if(FIRST_TIME) {
503 buffer[i-1] = '\n';
504 i--;
505 FIRST_TIME = 0;
506 }
507 while (i < end)
508 { c = buffer[i++];
509 CMask = RMask[c];
510 if (c != Newline)
511 {
512 r_even = B[0];
513 r1 = Init1 & r_even;
514 A[0] = (Next[r_even] & CMask) | r1;
515 r_odd = B[1];
516 r1 = Init1 & r_odd;
517 r2 = (r_even | Next[r_even|A[0]]) &r_NO_ERR;
518 A[1] = (Next[r_odd] & CMask) | r2 | r1 ;
519 if(D == 1) goto Nextchar;
520 r_even = B[2];
521 r1 = Init1 & r_even;
522 r2 = (r_odd | Next[r_odd|A[1]]) &r_NO_ERR;
523 A[2] = (Next[r_even] & CMask) | r2 | r1 ;
524 if(D == 2) goto Nextchar;
525 r_odd = B[3];
526 r1 = Init1 & r_odd;
527 r2 = (r_even | Next[r_even|A[2]]) &r_NO_ERR;
528 A[3] = (Next[r_odd] & CMask) | r2 | r1 ;
529 if(D == 3) goto Nextchar;
530 r_even = B[4];
531 r1 = Init1 & r_even;
532 r2 = (r_odd | Next[r_odd|A[3]]) &r_NO_ERR;
533 A[4] = (Next[r_even] & CMask) | r2 | r1 ;
534 goto Nextchar;
535 } /* if NOT Newline */
536 else { j++;
537 r1 = Init1 & B[D]; /* match endofline */
538 A[D] = (Next[B[D]] & CMask) | r1;
539 if(TAIL) A[D] = Next[A[D]] | A[D];
540 if((A[D] & 1) ^ INVERSE ) {
541 if(FILENAMEONLY) {
542 num_of_matched++;
543 printf("%s\n", CurrentFileName);
544 return;
545 }
546 r_output(buffer, i-1, end, j);
547 }
548 for(k=0; k<= D; k++) { A[k] = B[k] = Init[k]; }
549 r1 = Init1 & B[0];
550 A[0] = (Next[B[0]] & CMask) | r1;
551 for(k=1; k<= D; k++) {
552 r1 = Init1 & B[k];
553 r2 = (B[k-1] | Next[A[k-1]|B[k-1]]) &r_NO_ERR;
554 A[k] = (Next[B[k]] & CMask) | r1 | r2;
555 }
556 }
557 Nextchar:
558 c = buffer[i];
559 CMask = RMask[c];
560 if(c != Newline)
561 {
562 r1 = Init1 & A[0];
563 B[0] = (Next[A[0]] & CMask) | r1;
564 r1 = Init1 & A[1];
565 B[1] = (Next[A[1]] & CMask) | ((A[0] | Next[A[0] | B[0]]) & r_NO_ERR) | r1 ;
566 if(D == 1) goto Nextchar1;
567 r1 = Init1 & A[2];
568 B[2] = (Next[A[2]] & CMask) | ((A[1] | Next[A[1] | B[1]]) &r_NO_ERR) | r1 ;
569 if(D == 2) goto Nextchar1;
570 r1 = Init1 & A[3];
571 B[3] = (Next[A[3]] & CMask) | ((A[2] | Next[A[2] | B[2]])&r_NO_ERR) | r1 ;
572 if(D == 3) goto Nextchar1;
573 r1 = Init1 & A[4];
574 B[4] = (Next[A[4]] & CMask) | ((A[3] | Next[A[3] | B[3]])&r_NO_ERR) | r1 ;
575 goto Nextchar1;
576 } /* if(NOT Newline) */
577 else { j++;
578 r1 = Init1 & A[D]; /* match endofline */
579 B[D] = (Next[A[D]] & CMask) | r1;
580 if(TAIL) B[D] = Next[B[D]] | B[D];
581 if((B[D] & 1) ^ INVERSE ) {
582 if(FILENAMEONLY) {
583 num_of_matched++;
584 printf("%s\n", CurrentFileName);
585 return;
586 }
587 r_output(buffer, i, end, j);
588 }
589 for(k=0; k<= D; k++) { A[k] = B[k] = Init[k]; }
590 r1 = Init1 & A[0];
591 B[0] = (Next[A[0]] & CMask) | r1;
592 for(k=1; k<= D; k++) {
593 r1 = Init1 & A[k];
594 r2 = (A[k-1] | Next[A[k-1]|B[k-1]])&r_NO_ERR;
595 B[k] = (Next[A[k]] & CMask) | r1 | r2;
596 }
597 }
598 Nextchar1: i++;
599 } /* while i < end */
600 strncpy(buffer, buffer+num_read, Maxline);
601 } /* while read() */
602 return;
603 } /* re */
604
605
r_output(buffer,i,end,j)606 void r_output (buffer, i, end, j)
607 int i, end, j;
608 CHAR *buffer;
609 {
610 int bp;
611 if(i >= end) return;
612 num_of_matched++;
613 if(COUNT) return;
614 if(FNAME) printf("%s: ", CurrentFileName);
615 bp = i-1;
616 while ((buffer[bp] != '\n') && (bp > 0)) bp--;
617 if(LINENUM) printf("%d: ", j);
618 if(buffer[bp] != '\n') bp = Maxline-1;
619 bp++;
620 while (bp <= i ) putchar(buffer[bp++]);
621 }
622
main(argc,argv)623 int main(argc, argv)
624 int argc; char *argv[];
625 {
626 int N, M, D=0, fp, fd, i, j;
627 char c;
628 int filetype;
629 unsigned char Pattern[MAXPAT], OldPattern[MAXPAT], temp[MAXPAT];
630
631 initial_value();
632 strcpy(Progname, argv[0]);
633 if (argc < 2) usage();
634 Pattern[0] = '\0';
635 while(--argc > 0 && (*++argv)[0] == '-') {
636 c = *(argv[0]+1);
637 switch(c) {
638 case 'c' : COUNT = ON; /* output the # of matched */
639 break;
640 case 's' : SILENT = ON; /* silent mode */
641 break;
642 case 'p' : I = 0; /* insertion cost is 0 */
643 break;
644 case 'x' : WHOLELINE = ON; /* match the whole line */
645 if(WORDBOUND) {
646 fprintf(stderr, "%s: illegal option combination\n", Progname);
647 exit(2);
648 }
649 break;
650 case 'L' : break;
651 case 'd' : DELIMITER = ON; /* user defines delimiter */
652 if(argc <= 1) usage();
653 if (argv[0][2] == '\0') {/* space after -d option */
654 argv++;
655 if ((D_length = strlen(argv[0])) > MaxDelimit) {
656 fprintf(stderr, "%s: delimiter pattern too long\n", Progname);
657 exit(2);
658 }
659 D_pattern[0] = '<';
660 strcpy(D_pattern+1, argv[0]);
661 argc--;
662 } else {
663 if ((D_length = strlen(argv[0]+2)) > MaxDelimit) {
664 fprintf(stderr, "%s: delimiter pattern too long\n", Progname);
665 exit(2);
666 }
667 D_pattern[0] = '<';
668 strcpy(D_pattern+1, argv[0]+2);
669 } /* else */
670 strcat(D_pattern, ">; ");
671 D_length++; /* to count ';' as one */
672 break;
673 case 'e' : argc--;
674 if(argc == 0) {
675 fprintf(stderr, "%s: the pattern should immediately follow the -e option\n", Progname);
676 usage();
677 }
678 if((++argv)[0][0] == '-') {
679 Pattern[0] = '\\';
680 strcat(Pattern, (argv)[0]);
681 }
682 else strcat(Pattern, argv[0]);
683 break;
684 case 'f' : PAT_FILE = ON;
685 argv++;
686 argc--;
687 if((fp = open(argv[0], 0)) < 0) {
688 fprintf(stderr, "%s: Can't open pattern file %s\n", Progname, argv[0]);
689 exit(2);
690 }
691 break;
692 case 'h' : NOFILENAME = ON;
693 break;
694 case 'i' : NOUPPER = ON;
695 break;
696 case 'k' : argc--;
697 if(argc == 0) {
698 fprintf(stderr, "%s: the pattern should immediately follow the -k option\n", Progname);
699 usage();
700 }
701 CONSTANT = ON;
702 argv++;
703 strcat(Pattern, argv[0]);
704 if(argc > 1 && argv[1][0] == '-') {
705 fprintf(stderr, "%s: -k should be the last option in the command\n", Progname);
706 exit(2);
707 }
708 break;
709 case 'l' : FILENAMEONLY = ON;
710 break;
711 case 'n' : LINENUM = ON; /* output prefixed by line no*/
712 break;
713 case 'v' : INVERSE = ON; /* output no-matched lines */
714 break;
715 case 't' : OUTTAIL = ON; /* output from tail of delimiter */
716 break;
717 case 'B' : BESTMATCH = ON;
718 break;
719 case 'w' : WORDBOUND = ON;/* match to words */
720 if(WHOLELINE) {
721 fprintf(stderr, "%s: illegal option combination\n", Progname);
722 exit(2);
723 }
724 break;
725 case 'y' : NOPROMPT = ON;
726 break;
727 case 'I' : I = atoi(argv[0]+2); /* Insertion Cost */
728 JUMP = ON;
729 break;
730 case 'S' : S = atoi(argv[0]+2); /* Substitution Cost */
731 JUMP = ON;
732 break;
733 case 'D' : DD = atoi(argv[0]+2); /* Deletion Cost */
734 JUMP = ON;
735 break;
736 case 'G' : FILEOUT = ON;
737 COUNT = ON;
738 break;
739 default : if (isdigit(c)) {
740 APPROX = ON;
741 D = atoi(argv[0]+1);
742 if (D > MaxError) {
743 fprintf(stderr,"%s: the maximum number of errors is %d \n", Progname, MaxError);
744 exit(2);
745 }
746 } else {
747 fprintf(stderr, "%s: illegal option -%c\n",Progname, c);
748 usage();
749 }
750 } /* switch(c) */
751 } /* while (--argc > 0 && (*++argv)[0] == '-') */
752
753 if (FILENAMEONLY && NOFILENAME) {
754 fprintf(stderr, "%s: -h and -l options are mutually exclusive\n",Progname);
755 }
756 if (COUNT && (FILENAMEONLY || NOFILENAME)) {
757 FILENAMEONLY = OFF;
758 if(!FILEOUT) NOFILENAME = OFF;
759 }
760 if (!(PAT_FILE) && Pattern[0] == '\0') { /* Pattern not set with -e option */
761 if (argc == 0) usage();
762 strncpy(Pattern, *argv, sizeof(Pattern));
763 argc--;
764 argv++;
765 }
766 Numfiles = 0;
767 fd = 3; /* make sure it's not 0 */
768 if (argc == 0) /* check Pattern against stdin */
769 fd = 0;
770 else {
771 if (!(Textfiles = (CHAR **)malloc(argc * sizeof(CHAR *) ))) {
772 fprintf(stderr, "%s: malloc failure (you probably don't have enough memory)\n", Progname);
773 exit(2);
774 }
775 while (argc--) { /* one or more filenames on command line -- put
776 the valid filenames in a array of strings */
777 /*
778 if ((filetype = check_file(*argv)) != ISASCIIFILE) {
779 if(filetype == NOSUCHFILE) fprintf(stderr,"%s: %s: no such file or directory\n",Progname,*argv);
780 argv++;
781 */
782
783 if ((filetype = check_file(*argv)) == NOSUCHFILE) {
784 if(filetype == NOSUCHFILE) fprintf(stderr,"%s: %s: no such file or directory\n",Progname,*argv);
785 argv++;
786 } else { /* file is ascii*/
787 if (!(Textfiles[Numfiles] = (CHAR *)malloc((strlen(*argv)+1)))) {
788 fprintf(stderr, "%s: malloc failure (you probably don't have enough memory)\n", Progname);
789 exit(2);
790 }
791 strcpy(Textfiles[Numfiles++], *argv++);
792 } /* else */
793 } /* while (argc--) */
794 } /* else */
795 checksg(Pattern, D); /* check if the pattern is simple */
796 strcpy(OldPattern, Pattern);
797 if (SGREP == 0) {
798 preprocess(D_pattern, Pattern);
799 strcpy(old_D_pat, D_pattern);
800 M = maskgen(Pattern, D);
801 }
802 else M = strlen(OldPattern);
803 if (PAT_FILE) prepf(fp);
804 if (Numfiles > 1) FNAME = ON;
805 if (NOFILENAME) FNAME = 0;
806 num_of_matched = 0;
807 compat(); /* check compatibility between options */
808 if (fd == 0) {
809 if(FILENAMEONLY) {
810 fprintf(stderr, "%s: -l option is not compatible with standard input\n", Progname);
811 exit(2);
812 }
813 if(PAT_FILE) mgrep(fd);
814 else {
815 if(SGREP) sgrep(OldPattern, strlen(OldPattern), fd, D);
816 else bitap(old_D_pat, Pattern, fd, M, D);
817 }
818 if (COUNT) {
819 if(INVERSE && PAT_FILE) printf("%d\n", total_line-num_of_matched);
820 else printf("%d\n", num_of_matched);
821 }
822 }
823 else {
824 for (i = 0; i < Numfiles; i++, close(fd), num_of_matched = 0) {
825 strcpy(CurrentFileName, Textfiles[i]);
826 if ((fd = open(Textfiles[i], 0)) <= 0) {
827 fprintf(stderr, "%s: can't open file %s\n",Progname, Textfiles[i]);
828 }
829 else {
830 if(PAT_FILE) mgrep(fd);
831 else {
832 if(SGREP) sgrep(OldPattern, strlen(OldPattern), fd, D);
833 else bitap(old_D_pat, Pattern, fd, M, D);
834 }
835 if (num_of_matched) NOMATCH = OFF;
836 if (COUNT && !FILEOUT) {
837 if(INVERSE && PAT_FILE) {
838 if(FNAME) printf("%s: %d\n", CurrentFileName, total_line - num_of_matched);
839 else printf("%d\n", total_line - num_of_matched);
840 }
841 else {
842 if(FNAME) printf("%s: %d\n", CurrentFileName, num_of_matched);
843 else printf("%d\n", num_of_matched);
844 }
845 } /* if COUNT */
846 if(FILEOUT && num_of_matched) {
847 file_out(CurrentFileName);
848 }
849 } /* else */
850 } /* for i < Numfiles */
851 if(NOMATCH && BESTMATCH) {
852 if(WORDBOUND || WHOLELINE || LINENUM || INVERSE) {
853 SGREP = 0;
854 preprocess(D_pattern, Pattern);
855 strcpy(old_D_pat, D_pattern);
856 M = maskgen(Pattern, D);
857 }
858 COUNT=ON; D=1;
859 while(D<M && D<=MaxError && num_of_matched == 0) {
860 for (i = 0; i < Numfiles; i++, close(fd)) {
861 strcpy(CurrentFileName, Textfiles[i]);
862 if ((fd = open(Textfiles[i], 0)) > 0) {
863 if(PAT_FILE) mgrep(fd);
864 else {
865 if(SGREP) sgrep(OldPattern,strlen(OldPattern),fd,D);
866 else bitap(old_D_pat,Pattern,fd,M,D);
867 }
868 }
869 } /* for i < Numfiles */
870 D++;
871 } /* while */
872 if(num_of_matched > 0) {
873 D--; COUNT = 0;
874 if(NOPROMPT) goto GO_AHEAD;
875 if(D==1) fprintf(stderr, "best match has 1 error, ");
876 else fprintf(stderr, "best match has %d errors, ", D);
877 fflush(stderr);
878 if(num_of_matched == 1) fprintf(stderr,"there is 1 match, output it? (y/n)");
879 else fprintf(stderr,"there are %d matches, output them? (y/n)", num_of_matched);
880 scanf("%c",&c);
881 if(c != 'y') goto CONT;
882 GO_AHEAD:
883 for (i = 0; i < Numfiles; i++, close(fd)) {
884 strcpy(CurrentFileName, Textfiles[i]);
885 if ((fd = open(Textfiles[i], 0)) > 0) {
886 if(PAT_FILE) mgrep(fd);
887 else {
888 if(SGREP) sgrep(OldPattern,strlen(OldPattern),fd,D);
889 else bitap(old_D_pat,Pattern,fd,M,D);
890 }
891 }
892 } /* for i < Numfiles */
893 NOMATCH = 0;
894 }
895 }
896 }
897 CONT:
898 if(EATFIRST) {
899 printf("\n");
900 EATFIRST = OFF;
901 }
902 if(num_of_matched) NOMATCH = OFF;
903 if(NOMATCH) exit(1);
904 exit(0);
905 } /* end of main() */
906
907
file_out(fname)908 void file_out(fname)
909 char *fname;
910 {
911 int num_read;
912 int fd;
913 int i, len;
914 CHAR buf[4097];
915 if(FNAME) {
916 len = strlen(fname);
917 putchar('\n');
918 for(i=0; i< len; i++) putchar(':');
919 putchar('\n');
920 printf("%s\n", CurrentFileName);
921 len = strlen(fname);
922 for(i=0; i< len; i++) putchar(':');
923 putchar('\n');
924 fflush(stdout);
925 }
926 fd = open(fname, 0);
927 while((num_read = read(fd, buf, 4096)) > 0)
928 write(1, buf, num_read);
929 }
930
931
usage()932 void usage()
933 {
934 fprintf(stderr, "usage: %s [-#cdehiklnpstvwxBDGIS] [-f patternfile] pattern [files]\n", Progname);
935 printf("\n");
936 fprintf(stderr, "summary of frequently used options:\n");
937 fprintf(stderr, "-#: find matches with at most # errors\n");
938 fprintf(stderr, "-c: output the number of matched records\n");
939 fprintf(stderr, "-d: define record delimiter\n");
940 fprintf(stderr, "-h: do not output file names\n");
941 fprintf(stderr, "-i: case-insensitive search, e.g., 'a' = 'A'\n");
942 fprintf(stderr, "-l: output the names of files that contain a match\n");
943 fprintf(stderr, "-n: output record prefixed by record number\n");
944 fprintf(stderr, "-v: output those records containing no matches\n");
945 fprintf(stderr, "-w: pattern has to match as a word, e.g., 'win' will not match 'wind'\n");
946 fprintf(stderr, "-B: best match mode. find the closest matches to the pattern\n");
947 fprintf(stderr, "-G: output the files that contain a match\n");
948 printf("\n");
949
950 exit(2);
951 }
952
checksg(Pattern,D)953 void checksg(Pattern, D)
954 CHAR *Pattern; int D;
955 {
956 char c;
957 int i, m;
958 m = strlen(Pattern);
959 if(!(PAT_FILE) && m <= D) {
960 fprintf(stderr, "%s: size of pattern must be greater than number of errors\n", Progname);
961 exit(2);
962 }
963 SIMPLEPATTERN = ON;
964 for (i=0; i < m; i++)
965 {
966 switch(Pattern[i])
967 {
968 case ';' : SIMPLEPATTERN = OFF; break;
969 case ',' : SIMPLEPATTERN = OFF; break;
970 case '.' : SIMPLEPATTERN = OFF; break;
971 case '*' : SIMPLEPATTERN = OFF; break;
972 case '-' : SIMPLEPATTERN = OFF; break;
973 case '[' : SIMPLEPATTERN = OFF; break;
974 case ']' : SIMPLEPATTERN = OFF; break;
975 case '(' : SIMPLEPATTERN = OFF; break;
976 case ')' : SIMPLEPATTERN = OFF; break;
977 case '<' : SIMPLEPATTERN = OFF; break;
978 case '>' : SIMPLEPATTERN = OFF; break;
979 case '^' : if(D > 0) SIMPLEPATTERN = OFF;
980 break;
981 case '$' : if(D > 0) SIMPLEPATTERN = OFF;
982 break;
983 case '|' : SIMPLEPATTERN = OFF; break;
984 case '#' : SIMPLEPATTERN = OFF; break;
985 case '\\' : SIMPLEPATTERN = OFF; break;
986 default : break;
987 }
988 }
989 if (CONSTANT) SIMPLEPATTERN = ON;
990 if (SIMPLEPATTERN == OFF) return;
991 if (NOUPPER && D) return;
992 if (JUMP == ON) return;
993 if (I == 0) return;
994 if (LINENUM) return;
995 if (DELIMITER) return;
996 if (INVERSE) return;
997 if (WORDBOUND && D > 0) return;
998 if (WHOLELINE && D > 0) return;
999 if (SILENT) return; /* REMINDER: to be removed */
1000 SGREP = ON;
1001 if(m >= 16) DNA = ON;
1002 for(i=0; i<m; i++) {
1003 c = Pattern[i];
1004 if(c == 'a' || c == 'c' || c == 't' || c == 'g' ) ;
1005 else DNA = OFF;
1006 }
1007 return;
1008 }
1009
1010 void output (register CHAR *buffer, int i1, int i2, int j);
output(buffer,i1,i2,j)1011 void output (buffer, i1, i2, j)
1012 register CHAR *buffer; int i1, i2, j;
1013 {
1014 register CHAR *bp, *outend;
1015 if(i1 > i2) return;
1016 num_of_matched++;
1017 if(COUNT) return;
1018 if(SILENT) return;
1019 if(OUTTAIL) {
1020 i1 = i1 + D_length;
1021 i2 = i2 + D_length;
1022 }
1023 if(DELIMITER) j = j+1;
1024 if(FIRSTOUTPUT) {
1025 if (buffer[i1] == '\n') {
1026 i1++;
1027 EATFIRST = ON;
1028 }
1029 FIRSTOUTPUT = 0;
1030 }
1031 if(TRUNCATE) {
1032 fprintf(stderr, "WARNING!!! some lines have been truncated in output record #%d\n", num_of_matched-1);
1033 }
1034 while(buffer[i1] == '\n' && i1 <= i2) {
1035 printf("\n");
1036 i1++;
1037 }
1038 if(FNAME == ON) printf("%s: ", CurrentFileName);
1039 if(LINENUM) printf("%d: ", j-1);
1040 bp = buffer + i1;
1041 outend = buffer + i2;
1042 while(bp <= outend) putchar(*bp++);
1043 }
1044
1045 /* end of main.c */
1046