1 /* search.c
2 */
3
4 /* string search routines */
5
6 /* Copyright (c) 1981,1980 James Gosling */
7
8 /* Modified Aug. 12, 1981 by Tom London to include regular expressions
9 as in ed. RE stuff hacked over by jag to correct a few major problems,
10 mainly dealing with searching within the buffer rather than copying
11 each line to a separate array. Newlines can now appear in RE's */
12
13 /* Ripped to shreds and glued back together to make a search package,
14 * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
15 * Changes include:
16 * Buffer, window, and mlisp stuff gone.
17 * Translation tables reduced to 1 table.
18 * Expression buffer is now dynamically allocated.
19 * Character classes now implemented with a bitmap.
20 * Modified by David Canzi, Apr 1997:
21 * Check bounds on alternatives array.
22 * Correct spurious matching, eg. /: re: .*\bfoo/ matched ": re: bar".
23 */
24
25 #include "EXTERN.h"
26 #include "common.h"
27 #include "util.h"
28 #include "util2.h"
29 #include "INTERN.h"
30 #include "search.h"
31
32 #ifndef BITSPERBYTE
33 #define BITSPERBYTE 8
34 #endif
35
36 #define BMAPSIZ (127 / BITSPERBYTE + 1)
37
38 #define MNULL 64 /* Bit is set in a meta-character defn to
39 indicate that the metacharacter can match
40 a null string. advance() uses this. */
41
42 /* meta characters in the "compiled" form of a regular expression */
43 #define CBRA (2|MNULL) /* \( -- begin bracket */
44 #define CCHR 4 /* a vanilla character */
45 #define CDOT 6 /* . -- match anything except a newline */
46 #define CCL 8 /* [...] -- character class */
47 #define NCCL 10 /* [^...] -- negated character class */
48 #define CDOL (12|MNULL) /* $ -- matches the end of a line */
49 #define CEND (14|MNULL) /* The end of the pattern */
50 #define CKET (16|MNULL) /* \) -- close bracket */
51 #define CBACK (18|MNULL) /* \N -- backreference to the Nth bracketed
52 string */
53 #define CIRC (20|MNULL) /* ^ matches the beginning of a line */
54
55 #define WORD 32 /* matches word character \w */
56 #define NWORD 34 /* matches non-word characer \W */
57 #define WBOUND (36|MNULL) /* matches word boundary \b */
58 #define NWBOUND (38|MNULL) /* matches non-(word boundary) \B */
59
60 #define STAR 01 /* * -- Kleene star, repeats the previous
61 REas many times as possible; the value
62 ORs with the other operator types */
63
64 #define ASCSIZ 256
65 typedef Uchar TRANSTABLE[ASCSIZ];
66
67 static TRANSTABLE trans;
68 static bool folding = FALSE;
69
70 static int err;
71 static char* FirstCharacter;
72
73 void
search_init()74 search_init()
75 {
76 register int i;
77
78 for (i = 0; i < ASCSIZ; i++)
79 trans[i] = i;
80 }
81
82 void
init_compex(compex)83 init_compex(compex)
84 register COMPEX* compex;
85 {
86 /* the following must start off zeroed */
87
88 compex->eblen = 0;
89 compex->brastr = NULL;
90 }
91
92 void
free_compex(compex)93 free_compex(compex)
94 register COMPEX* compex;
95 {
96 if (compex->eblen) {
97 free(compex->expbuf);
98 compex->eblen = 0;
99 }
100 if (compex->brastr) {
101 free(compex->brastr);
102 compex->brastr = NULL;
103 }
104 }
105
106 static char* gbr_str = NULL;
107 static int gbr_siz = 0;
108
109 char*
getbracket(compex,n)110 getbracket(compex,n)
111 register COMPEX* compex;
112 int n;
113 {
114 int length = compex->braelist[n] - compex->braslist[n];
115
116 if (!compex->nbra)
117 return NULL;
118 if (n > compex->nbra || !compex->braelist[n] || length < 0)
119 return nullstr;
120 growstr(&gbr_str, &gbr_siz, length+1);
121 safecpy(gbr_str, compex->braslist[n], length+1);
122 return gbr_str;
123 }
124
125 void
case_fold(which)126 case_fold(which)
127 int which;
128 {
129 register int i;
130
131 if (which != folding) {
132 if (which) {
133 for (i = 'A'; i <= 'Z'; i++)
134 trans[i] = tolower(i);
135 }
136 else {
137 for (i = 'A'; i <= 'Z'; i++)
138 trans[i] = i;
139 }
140 folding = which;
141 }
142 }
143
144 /* Compile the given regular expression into a [secret] internal format */
145
146 char*
compile(compex,strp,RE,fold)147 compile(compex, strp, RE, fold)
148 register COMPEX* compex;
149 register char* strp;
150 int RE;
151 int fold;
152 {
153 register int c;
154 register char* ep;
155 char* lastep;
156 char bracket[NBRA];
157 char* bracketp;
158 char** alt = compex->alternatives;
159 char* retmes = "Badly formed search string";
160
161 case_fold(compex->do_folding = fold);
162 if (!compex->eblen) {
163 compex->expbuf = safemalloc(84);
164 compex->eblen = 80;
165 }
166 ep = compex->expbuf; /* point at expression buffer */
167 *alt++ = ep; /* first alternative starts here */
168 bracketp = bracket; /* first bracket goes here */
169 if (*strp == 0) { /* nothing to compile? */
170 if (*ep == 0) /* nothing there yet? */
171 return "Null search string";
172 return NULL; /* just keep old expression */
173 }
174 compex->nbra = 0; /* no brackets yet */
175 lastep = 0;
176 for (;;) {
177 if (ep + 4 - compex->expbuf >= compex->eblen)
178 ep = grow_eb(compex, ep, alt);
179 c = *strp++; /* fetch next char of pattern */
180 if (c == 0) { /* end of pattern? */
181 if (bracketp != bracket) { /* balanced brackets? */
182 #ifdef VERBOSE
183 retmes = "Unbalanced parens";
184 #endif
185 goto cerror;
186 }
187 *ep++ = CEND; /* terminate expression */
188 *alt++ = 0; /* terminal alternative list */
189 return NULL; /* return success */
190 }
191 if (c != '*')
192 lastep = ep;
193 if (!RE) { /* just a normal search string? */
194 *ep++ = CCHR; /* everything is a normal char */
195 *ep++ = c;
196 }
197 else /* it is a regular expression */
198 switch (c) {
199
200 case '\\': /* meta something */
201 switch (c = *strp++) {
202 case '(':
203 if (compex->nbra >= NBRA) {
204 #ifdef VERBOSE
205 retmes = "Too many parens";
206 #endif
207 goto cerror;
208 }
209 *bracketp++ = ++compex->nbra;
210 *ep++ = CBRA;
211 *ep++ = compex->nbra;
212 break;
213 case '|':
214 if (bracketp>bracket) {
215 #ifdef VERBOSE
216 retmes = "No \\| in parens"; /* Alas! */
217 #endif
218 goto cerror;
219 }
220 *ep++ = CEND;
221 *alt++ = ep;
222 if (alt > compex->alternatives + NALTS) {
223 #ifdef VERBOSE
224 retmes = "Too many alternatives in reg ex";
225 #endif
226 goto cerror;
227 }
228 break;
229 case ')':
230 if (bracketp <= bracket) {
231 #ifdef VERBOSE
232 retmes = "Unmatched right paren";
233 #endif
234 goto cerror;
235 }
236 *ep++ = CKET;
237 *ep++ = *--bracketp;
238 break;
239 case 'w':
240 *ep++ = WORD;
241 break;
242 case 'W':
243 *ep++ = NWORD;
244 break;
245 case 'b':
246 *ep++ = WBOUND;
247 break;
248 case 'B':
249 *ep++ = NWBOUND;
250 break;
251 case '0': case '1': case '2': case '3': case '4':
252 case '5': case '6': case '7': case '8': case '9':
253 *ep++ = CBACK;
254 *ep++ = c - '0';
255 break;
256 default:
257 *ep++ = CCHR;
258 if (c == '\0')
259 goto cerror;
260 *ep++ = c;
261 break;
262 }
263 break;
264 case '.':
265 *ep++ = CDOT;
266 continue;
267
268 case '*':
269 if (lastep == 0 || *lastep == CBRA || *lastep == CKET
270 || *lastep == CIRC
271 || (*lastep&STAR)|| *lastep>NWORD)
272 goto defchar;
273 *lastep |= STAR;
274 continue;
275
276 case '^':
277 if (ep != compex->expbuf && ep[-1] != CEND)
278 goto defchar;
279 *ep++ = CIRC;
280 continue;
281
282 case '$':
283 if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
284 goto defchar;
285 *ep++ = CDOL;
286 continue;
287
288 case '[': { /* character class */
289 register int i;
290
291 if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
292 ep = grow_eb(compex, ep, alt); /* reserve bitmap */
293
294 for (i = BMAPSIZ; i; --i)
295 ep[i] = 0;
296
297 if ((c = *strp++) == '^') {
298 c = *strp++;
299 *ep++ = NCCL; /* negated */
300 }
301 else
302 *ep++ = CCL; /* normal */
303
304 i = 0; /* remember oldchar */
305 do {
306 if (c == '\0') {
307 #ifdef VERBOSE
308 retmes = "Missing ]";
309 #endif
310 goto cerror;
311 }
312 if (*strp == '-' && *(++strp) != ']' && *strp)
313 i = *strp++;
314 else
315 i = c;
316 while (c <= i) {
317 ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
318 if (fold && isalpha(c))
319 ep[(c ^ 32) / BITSPERBYTE] |=
320 1 << ((c ^ 32) % BITSPERBYTE);
321 /* set the other bit too */
322 c++;
323 }
324 } while ((c = *strp++) != ']');
325 ep += BMAPSIZ;
326 continue;
327 }
328
329 defchar:
330 default:
331 *ep++ = CCHR;
332 *ep++ = c;
333 }
334 }
335 cerror:
336 compex->expbuf[0] = 0;
337 compex->nbra = 0;
338 return retmes;
339 }
340
341 char*
grow_eb(compex,epp,alt)342 grow_eb(compex, epp, alt)
343 register COMPEX* compex;
344 char* epp;
345 char** alt;
346 {
347 register char* oldbuf = compex->expbuf;
348 register char** altlist = compex->alternatives;
349
350 compex->eblen += 80;
351 compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4);
352 if (compex->expbuf != oldbuf) { /* realloc can change expbuf! */
353 epp += compex->expbuf - oldbuf;
354 while (altlist != alt)
355 *altlist++ += compex->expbuf - oldbuf;
356 }
357 return epp;
358 }
359
360 char*
execute(compex,addr)361 execute(compex, addr)
362 register COMPEX* compex;
363 char* addr;
364 {
365 register char* p1 = addr;
366 register Uchar* trt = trans;
367 register int c;
368
369 if (addr == NULL || compex->expbuf == NULL)
370 return NULL;
371 if (compex->nbra) { /* any brackets? */
372 for (c = 0; c <= compex->nbra; c++)
373 compex->braslist[c] = compex->braelist[c] = NULL;
374 if (compex->brastr)
375 free(compex->brastr);
376 compex->brastr = savestr(p1); /* in case p1 is not static */
377 p1 = compex->brastr; /* ! */
378 }
379 case_fold(compex->do_folding); /* make sure table is correct */
380 FirstCharacter = p1; /* for ^ tests */
381 if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) {
382 c = trt[*(Uchar*)(compex->expbuf+1)]; /* fast check for first char */
383 do {
384 if (trt[*(Uchar*)p1] == c && advance(compex, p1, compex->expbuf))
385 return p1;
386 p1++;
387 } while (*p1 && !err);
388 if (err) err = 0;
389 return NULL;
390 }
391 else { /* regular algorithm */
392 do {
393 register char** alt = compex->alternatives;
394 while (*alt) {
395 if (advance(compex, p1, *alt++))
396 return p1;
397 }
398 p1++;
399 } while (*p1 && !err);
400 if (err) err = 0;
401 return NULL;
402 }
403 /*NOTREACHED*/
404 }
405
406 /* advance the match of the regular expression starting at ep along the
407 string lp, simulates an NDFSA */
408 bool
advance(compex,lp,ep)409 advance(compex, lp, ep)
410 register COMPEX* compex;
411 register char* ep;
412 register char* lp;
413 {
414 register char* curlp;
415 register Uchar* trt = trans;
416 register int i;
417
418 while (*lp || (*ep & (STAR|MNULL))) {
419 switch (*ep++) {
420
421 case CCHR:
422 if (trt[*(Uchar*)ep++] != trt[*(Uchar*)lp])
423 return FALSE;
424 lp++;
425 continue;
426
427 case CDOT:
428 if (*lp == '\n') return FALSE;
429 lp++;
430 continue;
431
432 case CDOL:
433 if (!*lp || *lp == '\n')
434 continue;
435 return FALSE;
436
437 case CIRC:
438 if (lp == FirstCharacter || lp[-1]=='\n')
439 continue;
440 return FALSE;
441
442 case WORD:
443 if (isalnum(*lp)) {
444 lp++;
445 continue;
446 }
447 return FALSE;
448
449 case NWORD:
450 if (!isalnum(*lp)) {
451 lp++;
452 continue;
453 }
454 return FALSE;
455
456 case WBOUND:
457 if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
458 (!*lp || !isalnum(*lp)) )
459 continue;
460 return FALSE;
461
462 case NWBOUND:
463 if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
464 (!*lp || !isalnum(*lp)))
465 continue;
466 return FALSE;
467
468 case CEND:
469 return TRUE;
470
471 case CCL:
472 if (cclass(ep, *lp, 1)) {
473 ep += BMAPSIZ;
474 lp++;
475 continue;
476 }
477 return FALSE;
478
479 case NCCL:
480 if (cclass(ep, *lp, 0)) {
481 ep += BMAPSIZ;
482 lp++;
483 continue;
484 }
485 return FALSE;
486
487 case CBRA:
488 compex->braslist[(unsigned char)*ep++] = lp;
489 continue;
490
491 case CKET:
492 i = *ep++;
493 compex->braelist[i] = lp;
494 compex->braelist[0] = lp;
495 compex->braslist[0] = compex->braslist[i];
496 continue;
497
498 case CBACK:
499 if (compex->braelist[i = *ep++] == 0) {
500 fputs("bad braces\n",stdout) FLUSH;
501 err = TRUE;
502 return FALSE;
503 }
504 if (backref(compex, i, lp)) {
505 lp += compex->braelist[i] - compex->braslist[i];
506 continue;
507 }
508 return FALSE;
509
510 case CBACK | STAR:
511 if (compex->braelist[i = *ep++] == 0) {
512 fputs("bad braces\n",stdout) FLUSH;
513 err = TRUE;
514 return FALSE;
515 }
516 curlp = lp;
517 while (backref(compex, i, lp)) {
518 lp += compex->braelist[i] - compex->braslist[i];
519 }
520 while (lp >= curlp) {
521 if (advance(compex, lp, ep))
522 return TRUE;
523 lp -= compex->braelist[i] - compex->braslist[i];
524 }
525 continue;
526
527 case CDOT | STAR:
528 curlp = lp;
529 while (*lp++ && lp[-1] != '\n') ;
530 goto star;
531
532 case WORD | STAR:
533 curlp = lp;
534 while (*lp++ && isalnum(lp[-1])) ;
535 goto star;
536
537 case NWORD | STAR:
538 curlp = lp;
539 while (*lp++ && !isalnum(lp[-1])) ;
540 goto star;
541
542 case CCHR | STAR:
543 curlp = lp;
544 while (*lp++ && trt[*(Uchar*)(lp-1)] == trt[*(Uchar*)ep]) ;
545 ep++;
546 goto star;
547
548 case CCL | STAR:
549 case NCCL | STAR:
550 curlp = lp;
551 while (*lp++ && cclass(ep, lp[-1], ep[-1] == (CCL | STAR))) ;
552 ep += BMAPSIZ;
553 goto star;
554
555 star:
556 do {
557 lp--;
558 if (advance(compex, lp, ep))
559 return TRUE;
560 } while (lp > curlp);
561 return FALSE;
562
563 default:
564 fputs("Badly compiled pattern\n",stdout) FLUSH;
565 err = TRUE;
566 return -1;
567 }
568 }
569 return FALSE;
570 }
571
572 bool
backref(compex,i,lp)573 backref(compex, i, lp)
574 register COMPEX* compex;
575 register int i;
576 register char* lp;
577 {
578 register char* bp;
579
580 bp = compex->braslist[i];
581 while (*lp && *bp == *lp) {
582 bp++;
583 lp++;
584 if (bp >= compex->braelist[i])
585 return TRUE;
586 }
587 return FALSE;
588 }
589
590 bool
cclass(set,c,af)591 cclass(set, c, af)
592 register char* set;
593 register int c;
594 int af;
595 {
596 c &= 0177;
597 #if BITSPERBYTE == 8
598 if (set[c >> 3] & 1 << (c & 7))
599 #else
600 if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
601 #endif
602 return af;
603 return !af;
604 }
605