1 /* search.c
2  */
3 
4 /* string search routines */
5 
6 /*		Copyright (c) 1981,1980 James Gosling		*/
7 
8 /* Modified Aug. 12, 1981 by Tom London to include regular expressions
9    as in ed.  RE stuff hacked over by jag to correct a few major problems,
10    mainly dealing with searching within the buffer rather than copying
11    each line to a separate array.  Newlines can now appear in RE's */
12 
13 /* Ripped to shreds and glued back together to make a search package,
14  * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
15  * Changes include:
16  *	Buffer, window, and mlisp stuff gone.
17  *	Translation tables reduced to 1 table.
18  *	Expression buffer is now dynamically allocated.
19  *	Character classes now implemented with a bitmap.
20  * Modified by David Canzi, Apr 1997:
21  *	Check bounds on alternatives array.
22  *	Correct spurious matching, eg. /: re: .*\bfoo/ matched ": re: bar".
23  */
24 
25 #include "EXTERN.h"
26 #include "common.h"
27 #include "util.h"
28 #include "util2.h"
29 #include "INTERN.h"
30 #include "search.h"
31 
32 #ifndef BITSPERBYTE
33 #define BITSPERBYTE 8
34 #endif
35 
36 #define BMAPSIZ (127 / BITSPERBYTE + 1)
37 
38 #define MNULL	64		/* Bit is set in a meta-character defn to
39 				   indicate that the metacharacter can match
40 				   a null string.  advance() uses this. */
41 
42 /* meta characters in the "compiled" form of a regular expression */
43 #define CBRA	(2|MNULL)	/* \( -- begin bracket */
44 #define	CCHR	4		/* a vanilla character */
45 #define	CDOT	6		/* . -- match anything except a newline */
46 #define	CCL	8		/* [...] -- character class */
47 #define	NCCL	10		/* [^...] -- negated character class */
48 #define CDOL	(12|MNULL)	/* $ -- matches the end of a line */
49 #define CEND	(14|MNULL)	/* The end of the pattern */
50 #define CKET	(16|MNULL)	/* \) -- close bracket */
51 #define CBACK	(18|MNULL)	/* \N -- backreference to the Nth bracketed
52 				   string */
53 #define CIRC	(20|MNULL)	/* ^ matches the beginning of a line */
54 
55 #define WORD	32		/* matches word character \w */
56 #define NWORD	34		/* matches non-word characer \W */
57 #define WBOUND	(36|MNULL)	/* matches word boundary \b */
58 #define NWBOUND	(38|MNULL)	/* matches non-(word boundary) \B */
59 
60 #define	STAR	01		/* * -- Kleene star, repeats the previous
61 				   REas many times as possible; the value
62 				   ORs with the other operator types */
63 
64 #define ASCSIZ 256
65 typedef Uchar	TRANSTABLE[ASCSIZ];
66 
67 static TRANSTABLE trans;
68 static bool folding = FALSE;
69 
70 static int err;
71 static char* FirstCharacter;
72 
73 void
search_init()74 search_init()
75 {
76     register int    i;
77 
78     for (i = 0; i < ASCSIZ; i++)
79 	trans[i] = i;
80 }
81 
82 void
init_compex(compex)83 init_compex(compex)
84 register COMPEX* compex;
85 {
86     /* the following must start off zeroed */
87 
88     compex->eblen = 0;
89     compex->brastr = NULL;
90 }
91 
92 void
free_compex(compex)93 free_compex(compex)
94 register COMPEX* compex;
95 {
96     if (compex->eblen) {
97 	free(compex->expbuf);
98 	compex->eblen = 0;
99     }
100     if (compex->brastr) {
101 	free(compex->brastr);
102 	compex->brastr = NULL;
103     }
104 }
105 
106 static char* gbr_str = NULL;
107 static int gbr_siz = 0;
108 
109 char*
getbracket(compex,n)110 getbracket(compex,n)
111 register COMPEX* compex;
112 int n;
113 {
114     int length = compex->braelist[n] - compex->braslist[n];
115 
116     if (!compex->nbra)
117 	return NULL;
118     if (n > compex->nbra || !compex->braelist[n] || length < 0)
119 	return nullstr;
120     growstr(&gbr_str, &gbr_siz, length+1);
121     safecpy(gbr_str, compex->braslist[n], length+1);
122     return gbr_str;
123 }
124 
125 void
case_fold(which)126 case_fold(which)
127 int which;
128 {
129     register int i;
130 
131     if (which != folding) {
132 	if (which) {
133 	    for (i = 'A'; i <= 'Z'; i++)
134 		trans[i] = tolower(i);
135 	}
136 	else {
137 	    for (i = 'A'; i <= 'Z'; i++)
138 		trans[i] = i;
139 	}
140 	folding = which;
141     }
142 }
143 
144 /* Compile the given regular expression into a [secret] internal format */
145 
146 char*
compile(compex,strp,RE,fold)147 compile(compex, strp, RE, fold)
148 register COMPEX* compex;
149 register char* strp;
150 int RE;
151 int fold;
152 {
153     register int c;
154     register char* ep;
155     char* lastep;
156     char  bracket[NBRA];
157     char* bracketp;
158     char** alt = compex->alternatives;
159     char* retmes = "Badly formed search string";
160 
161     case_fold(compex->do_folding = fold);
162     if (!compex->eblen) {
163 	compex->expbuf = safemalloc(84);
164 	compex->eblen = 80;
165     }
166     ep = compex->expbuf;		/* point at expression buffer */
167     *alt++ = ep;			/* first alternative starts here */
168     bracketp = bracket;			/* first bracket goes here */
169     if (*strp == 0) {			/* nothing to compile? */
170 	if (*ep == 0)			/* nothing there yet? */
171 	    return "Null search string";
172 	return NULL;			/* just keep old expression */
173     }
174     compex->nbra = 0;			/* no brackets yet */
175     lastep = 0;
176     for (;;) {
177 	if (ep + 4 - compex->expbuf >= compex->eblen)
178 	    ep = grow_eb(compex, ep, alt);
179 	c = *strp++;			/* fetch next char of pattern */
180 	if (c == 0) {			/* end of pattern? */
181 	    if (bracketp != bracket) {	/* balanced brackets? */
182 #ifdef VERBOSE
183 		retmes = "Unbalanced parens";
184 #endif
185 		goto cerror;
186 	    }
187 	    *ep++ = CEND;		/* terminate expression */
188 	    *alt++ = 0;			/* terminal alternative list */
189 	    return NULL;		/* return success */
190 	}
191 	if (c != '*')
192 	    lastep = ep;
193 	if (!RE) {			/* just a normal search string? */
194 	    *ep++ = CCHR;		/* everything is a normal char */
195 	    *ep++ = c;
196 	}
197 	else				/* it is a regular expression */
198 	    switch (c) {
199 
200 		case '\\':		/* meta something */
201 		    switch (c = *strp++) {
202 		    case '(':
203 			if (compex->nbra >= NBRA) {
204 #ifdef VERBOSE
205 			    retmes = "Too many parens";
206 #endif
207 			    goto cerror;
208 			}
209 			*bracketp++ = ++compex->nbra;
210 			*ep++ = CBRA;
211 			*ep++ = compex->nbra;
212 			break;
213 		    case '|':
214 			if (bracketp>bracket) {
215 #ifdef VERBOSE
216 			    retmes = "No \\| in parens";	/* Alas! */
217 #endif
218 			    goto cerror;
219 			}
220 			*ep++ = CEND;
221 			*alt++ = ep;
222 			if (alt > compex->alternatives + NALTS) {
223 #ifdef VERBOSE
224 				retmes = "Too many alternatives in reg ex";
225 #endif
226 				goto cerror;
227 			}
228 			break;
229 		    case ')':
230 			if (bracketp <= bracket) {
231 #ifdef VERBOSE
232 			    retmes = "Unmatched right paren";
233 #endif
234 			    goto cerror;
235 			}
236 			*ep++ = CKET;
237 			*ep++ = *--bracketp;
238 			break;
239 		    case 'w':
240 			*ep++ = WORD;
241 			break;
242 		    case 'W':
243 			*ep++ = NWORD;
244 			break;
245 		    case 'b':
246 			*ep++ = WBOUND;
247 			break;
248 		    case 'B':
249 			*ep++ = NWBOUND;
250 			break;
251 		    case '0': case '1': case '2': case '3': case '4':
252 		    case '5': case '6': case '7': case '8': case '9':
253 			*ep++ = CBACK;
254 			*ep++ = c - '0';
255 			break;
256 		    default:
257 			*ep++ = CCHR;
258 			if (c == '\0')
259 			    goto cerror;
260 			*ep++ = c;
261 			break;
262 		    }
263 		    break;
264 		case '.':
265 		    *ep++ = CDOT;
266 		    continue;
267 
268 		case '*':
269 		    if (lastep == 0 || *lastep == CBRA || *lastep == CKET
270 			|| *lastep == CIRC
271 			|| (*lastep&STAR)|| *lastep>NWORD)
272 			goto defchar;
273 		    *lastep |= STAR;
274 		    continue;
275 
276 		case '^':
277 		    if (ep != compex->expbuf && ep[-1] != CEND)
278 			goto defchar;
279 		    *ep++ = CIRC;
280 		    continue;
281 
282 		case '$':
283 		    if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
284 			goto defchar;
285 		    *ep++ = CDOL;
286 		    continue;
287 
288 		case '[': {		/* character class */
289 		    register int i;
290 
291 		    if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
292 			ep = grow_eb(compex, ep, alt); /* reserve bitmap */
293 
294 		    for (i = BMAPSIZ; i; --i)
295 			ep[i] = 0;
296 
297 		    if ((c = *strp++) == '^') {
298 			c = *strp++;
299 			*ep++ = NCCL;	/* negated */
300 		    }
301 		    else
302 			*ep++ = CCL;	/* normal */
303 
304 		    i = 0;		/* remember oldchar */
305 		    do {
306 			if (c == '\0') {
307 #ifdef VERBOSE
308 			    retmes = "Missing ]";
309 #endif
310 			    goto cerror;
311 			}
312 			if (*strp == '-' && *(++strp) != ']' && *strp)
313 			    i = *strp++;
314 			else
315 			    i = c;
316 			while (c <= i) {
317 			    ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
318 			    if (fold && isalpha(c))
319 				ep[(c ^ 32) / BITSPERBYTE] |=
320 				    1 << ((c ^ 32) % BITSPERBYTE);
321 					/* set the other bit too */
322 			    c++;
323 			}
324 		    } while ((c = *strp++) != ']');
325 		    ep += BMAPSIZ;
326 		    continue;
327 		}
328 
329 	    defchar:
330 		default:
331 		    *ep++ = CCHR;
332 		    *ep++ = c;
333 	    }
334     }
335 cerror:
336     compex->expbuf[0] = 0;
337     compex->nbra = 0;
338     return retmes;
339 }
340 
341 char*
grow_eb(compex,epp,alt)342 grow_eb(compex, epp, alt)
343 register COMPEX* compex;
344 char* epp;
345 char** alt;
346 {
347     register char* oldbuf = compex->expbuf;
348     register char** altlist = compex->alternatives;
349 
350     compex->eblen += 80;
351     compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4);
352     if (compex->expbuf != oldbuf) {	/* realloc can change expbuf! */
353 	epp += compex->expbuf - oldbuf;
354 	while (altlist != alt)
355 	    *altlist++ += compex->expbuf - oldbuf;
356     }
357     return epp;
358 }
359 
360 char*
execute(compex,addr)361 execute(compex, addr)
362 register COMPEX* compex;
363 char* addr;
364 {
365     register char* p1 = addr;
366     register Uchar* trt = trans;
367     register int c;
368 
369     if (addr == NULL || compex->expbuf == NULL)
370 	return NULL;
371     if (compex->nbra) {			/* any brackets? */
372 	for (c = 0; c <= compex->nbra; c++)
373 	    compex->braslist[c] = compex->braelist[c] = NULL;
374 	if (compex->brastr)
375 	    free(compex->brastr);
376 	compex->brastr = savestr(p1);	/* in case p1 is not static */
377 	p1 = compex->brastr;		/* ! */
378     }
379     case_fold(compex->do_folding);	/* make sure table is correct */
380     FirstCharacter = p1;		/* for ^ tests */
381     if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) {
382 	c = trt[*(Uchar*)(compex->expbuf+1)]; /* fast check for first char */
383 	do {
384 	    if (trt[*(Uchar*)p1] == c && advance(compex, p1, compex->expbuf))
385 		return p1;
386 	    p1++;
387 	} while (*p1 && !err);
388 	if (err) err = 0;
389 	return NULL;
390     }
391     else {			/* regular algorithm */
392 	do {
393 	    register char** alt = compex->alternatives;
394 	    while (*alt) {
395 		if (advance(compex, p1, *alt++))
396 		    return p1;
397 	    }
398 	    p1++;
399 	} while (*p1 && !err);
400 	if (err) err = 0;
401 	return NULL;
402     }
403    /*NOTREACHED*/
404 }
405 
406 /* advance the match of the regular expression starting at ep along the
407    string lp, simulates an NDFSA */
408 bool
advance(compex,lp,ep)409 advance(compex, lp, ep)
410 register COMPEX* compex;
411 register char* ep;
412 register char* lp;
413 {
414     register char* curlp;
415     register Uchar* trt = trans;
416     register int i;
417 
418     while (*lp || (*ep & (STAR|MNULL))) {
419 	switch (*ep++) {
420 
421 	    case CCHR:
422 		if (trt[*(Uchar*)ep++] != trt[*(Uchar*)lp])
423 		    return FALSE;
424 		lp++;
425 		continue;
426 
427 	    case CDOT:
428 		if (*lp == '\n') return FALSE;
429 		lp++;
430 		continue;
431 
432 	    case CDOL:
433 		if (!*lp || *lp == '\n')
434 		    continue;
435 		return FALSE;
436 
437 	    case CIRC:
438 		if (lp == FirstCharacter || lp[-1]=='\n')
439 		    continue;
440 		return FALSE;
441 
442 	    case WORD:
443 		if (isalnum(*lp)) {
444 		    lp++;
445 		    continue;
446 		}
447 		return FALSE;
448 
449 	    case NWORD:
450 		if (!isalnum(*lp)) {
451 		    lp++;
452 		    continue;
453 		}
454 		return FALSE;
455 
456 	    case WBOUND:
457 		if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
458 			(!*lp || !isalnum(*lp)) )
459 		    continue;
460 		return FALSE;
461 
462 	    case NWBOUND:
463 		if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
464 			(!*lp || !isalnum(*lp)))
465 		    continue;
466 		return FALSE;
467 
468 	    case CEND:
469 		return TRUE;
470 
471 	    case CCL:
472 		if (cclass(ep, *lp, 1)) {
473 		    ep += BMAPSIZ;
474 		    lp++;
475 		    continue;
476 		}
477 		return FALSE;
478 
479 	    case NCCL:
480 		if (cclass(ep, *lp, 0)) {
481 		    ep += BMAPSIZ;
482 		    lp++;
483 		    continue;
484 		}
485 		return FALSE;
486 
487 	    case CBRA:
488 		compex->braslist[(unsigned char)*ep++] = lp;
489 		continue;
490 
491 	    case CKET:
492 		i = *ep++;
493 		compex->braelist[i] = lp;
494 		compex->braelist[0] = lp;
495 		compex->braslist[0] = compex->braslist[i];
496 		continue;
497 
498 	    case CBACK:
499 		if (compex->braelist[i = *ep++] == 0) {
500 		    fputs("bad braces\n",stdout) FLUSH;
501 		    err = TRUE;
502 		    return FALSE;
503 		}
504 		if (backref(compex, i, lp)) {
505 		    lp += compex->braelist[i] - compex->braslist[i];
506 		    continue;
507 		}
508 		return FALSE;
509 
510 	    case CBACK | STAR:
511 		if (compex->braelist[i = *ep++] == 0) {
512 		    fputs("bad braces\n",stdout) FLUSH;
513 		    err = TRUE;
514 		    return FALSE;
515 		}
516 		curlp = lp;
517 		while (backref(compex, i, lp)) {
518 		    lp += compex->braelist[i] - compex->braslist[i];
519 		}
520 		while (lp >= curlp) {
521 		    if (advance(compex, lp, ep))
522 			return TRUE;
523 		    lp -= compex->braelist[i] - compex->braslist[i];
524 		}
525 		continue;
526 
527 	    case CDOT | STAR:
528 		curlp = lp;
529 		while (*lp++ && lp[-1] != '\n') ;
530 		goto star;
531 
532 	    case WORD | STAR:
533 		curlp = lp;
534 		while (*lp++ && isalnum(lp[-1])) ;
535 		goto star;
536 
537 	    case NWORD | STAR:
538 		curlp = lp;
539 		while (*lp++ && !isalnum(lp[-1])) ;
540 		goto star;
541 
542 	    case CCHR | STAR:
543 		curlp = lp;
544 		while (*lp++ && trt[*(Uchar*)(lp-1)] == trt[*(Uchar*)ep]) ;
545 		ep++;
546 		goto star;
547 
548 	    case CCL | STAR:
549 	    case NCCL | STAR:
550 		curlp = lp;
551 		while (*lp++ && cclass(ep, lp[-1], ep[-1] == (CCL | STAR))) ;
552 		ep += BMAPSIZ;
553 		goto star;
554 
555 	star:
556 		do {
557 		    lp--;
558 		    if (advance(compex, lp, ep))
559 			return TRUE;
560 		} while (lp > curlp);
561 		return FALSE;
562 
563 	    default:
564 		fputs("Badly compiled pattern\n",stdout) FLUSH;
565 		err = TRUE;
566 		return -1;
567 	}
568     }
569     return FALSE;
570 }
571 
572 bool
backref(compex,i,lp)573 backref(compex, i, lp)
574 register COMPEX* compex;
575 register int i;
576 register char* lp;
577 {
578     register char* bp;
579 
580     bp = compex->braslist[i];
581     while (*lp && *bp == *lp) {
582 	bp++;
583 	lp++;
584 	if (bp >= compex->braelist[i])
585 	    return TRUE;
586     }
587     return FALSE;
588 }
589 
590 bool
cclass(set,c,af)591 cclass(set, c, af)
592 register char* set;
593 register int c;
594 int af;
595 {
596     c &= 0177;
597 #if BITSPERBYTE == 8
598     if (set[c >> 3] & 1 << (c & 7))
599 #else
600     if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
601 #endif
602 	return af;
603     return !af;
604 }
605