1 /* $Id: search.c,v 3.0 1992/02/01 03:09:32 davison Trn $
2  */
3 
4 /* string search routines */
5 
6 /*		Copyright (c) 1981,1980 James Gosling		*/
7 
8 /* Modified Aug. 12, 1981 by Tom London to include regular expressions
9    as in ed.  RE stuff hacked over by jag to correct a few major problems,
10    mainly dealing with searching within the buffer rather than copying
11    each line to a separate array.  Newlines can now appear in RE's */
12 
13 /* Ripped to shreds and glued back together to make a search package,
14  * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
15  * Changes include:
16  *	Buffer, window, and mlisp stuff gone.
17  *	Translation tables reduced to 1 table.
18  *	Expression buffer is now dynamically allocated.
19  *	Character classes now implemented with a bitmap.
20  */
21 
22 #include "EXTERN.h"
23 #include "common.h"
24 #include "util.h"
25 #include "INTERN.h"
26 #include "search.h"
27 
28 #ifndef BITSPERBYTE
29 #define BITSPERBYTE 8
30 #endif
31 
32 #define BMAPSIZ (127 / BITSPERBYTE + 1)
33 
34 /* meta characters in the "compiled" form of a regular expression */
35 #define	CBRA	2		/* \( -- begin bracket */
36 #define	CCHR	4		/* a vanilla character */
37 #define	CDOT	6		/* . -- match anything except a newline */
38 #define	CCL	8		/* [...] -- character class */
39 #define	NCCL	10		/* [^...] -- negated character class */
40 #define	CDOL	12		/* $ -- matches the end of a line */
41 #define	CEND	14		/* The end of the pattern */
42 #define	CKET	16		/* \) -- close bracket */
43 #define	CBACK	18		/* \N -- backreference to the Nth bracketed
44 				   string */
45 #define CIRC	20		/* ^ matches the beginning of a line */
46 
47 #define WORD	32		/* matches word character \w */
48 #define NWORD	34		/* matches non-word characer \W */
49 #define WBOUND	36		/* matches word boundary \b */
50 #define NWBOUND	38		/* matches non-(word boundary) \B */
51 
52 #define	STAR	01		/* * -- Kleene star, repeats the previous
53 				   REas many times as possible; the value
54 				   ORs with the other operator types */
55 
56 #define ASCSIZ 0200
57 typedef char	TRANSTABLE[ASCSIZ];
58 
59 static	TRANSTABLE trans = {
60 0000,0001,0002,0003,0004,0005,0006,0007,
61 0010,0011,0012,0013,0014,0015,0016,0017,
62 0020,0021,0022,0023,0024,0025,0026,0027,
63 0030,0031,0032,0033,0034,0035,0036,0037,
64 0040,0041,0042,0043,0044,0045,0046,0047,
65 0050,0051,0052,0053,0054,0055,0056,0057,
66 0060,0061,0062,0063,0064,0065,0066,0067,
67 0070,0071,0072,0073,0074,0075,0076,0077,
68 0100,0101,0102,0103,0104,0105,0106,0107,
69 0110,0111,0112,0113,0114,0115,0116,0117,
70 0120,0121,0122,0123,0124,0125,0126,0127,
71 0130,0131,0132,0133,0134,0135,0136,0137,
72 0140,0141,0142,0143,0144,0145,0146,0147,
73 0150,0151,0152,0153,0154,0155,0156,0157,
74 0160,0161,0162,0163,0164,0165,0166,0167,
75 0170,0171,0172,0173,0174,0175,0176,0177,
76 };
77 static bool folding = FALSE;
78 
79 static int err;
80 static char *FirstCharacter;
81 
82 void
search_init()83 search_init()
84 {
85 #ifdef UNDEF
86     register int    i;
87 
88     for (i = 0; i < ASCSIZ; i++)
89 	trans[i] = i;
90 #else
91     ;
92 #endif
93 }
94 
95 void
init_compex(compex)96 init_compex(compex)
97 register COMPEX *compex;
98 {
99     /* the following must start off zeroed */
100 
101     compex->eblen = 0;
102     compex->brastr = Nullch;
103 }
104 
105 void
free_compex(compex)106 free_compex(compex)
107 register COMPEX *compex;
108 {
109     if (compex->eblen) {
110 	free(compex->expbuf);
111 	compex->eblen = 0;
112     }
113     if (compex->brastr) {
114 	free(compex->brastr);
115 	compex->brastr = Nullch;
116     }
117 }
118 
119 static char *gbr_str = Nullch;
120 static int gbr_siz = 0;
121 
122 char *
getbracket(compex,n)123 getbracket(compex,n)
124 register COMPEX *compex;
125 int n;
126 {
127     int length = compex->braelist[n] - compex->braslist[n];
128 
129     if (!compex->nbra)
130 	return Nullch;
131     if (n > compex->nbra || !compex->braelist[n] || length < 0)
132 	return nullstr;
133     growstr(&gbr_str, &gbr_siz, length+1);
134     safecpy(gbr_str, compex->braslist[n], length+1);
135     return gbr_str;
136 }
137 
138 void
case_fold(which)139 case_fold(which)
140 int which;
141 {
142     register int i;
143 
144     if (which != folding) {
145 	if (which) {
146 	    for (i = 'A'; i <= 'Z'; i++)
147 		trans[i] = tolower(i);
148 	}
149 	else {
150 	    for (i = 'A'; i <= 'Z'; i++)
151 		trans[i] = i;
152 	}
153 	folding = which;
154     }
155 }
156 
157 /* Compile the given regular expression into a [secret] internal format */
158 
159 char *
compile(compex,strp,RE,fold)160 compile(compex, strp, RE, fold)
161 register COMPEX *compex;
162 register char   *strp;
163 int RE;
164 int fold;
165 {
166     register int c;
167     register char  *ep;
168     char   *lastep;
169     char    bracket[NBRA],
170 	   *bracketp;
171     char **alt = compex->alternatives;
172     char *retmes = "Badly formed search string";
173 
174     case_fold(compex->do_folding = fold);
175     if (!compex->eblen) {
176 	compex->expbuf = safemalloc(84);
177 	compex->eblen = 80;
178     }
179     ep = compex->expbuf;		/* point at expression buffer */
180     *alt++ = ep;			/* first alternative starts here */
181     bracketp = bracket;			/* first bracket goes here */
182     if (*strp == 0) {			/* nothing to compile? */
183 	if (*ep == 0)			/* nothing there yet? */
184 	    return "Null search string";
185 	return Nullch;			/* just keep old expression */
186     }
187     compex->nbra = 0;			/* no brackets yet */
188     lastep = 0;
189     for (;;) {
190 	if (ep + 4 - compex->expbuf >= compex->eblen)
191 	    ep = grow_eb(compex, ep, alt);
192 	c = *strp++;			/* fetch next char of pattern */
193 	if (c == 0) {			/* end of pattern? */
194 	    if (bracketp != bracket) {	/* balanced brackets? */
195 #ifdef VERBOSE
196 		retmes = "Unbalanced parens";
197 #endif
198 		goto cerror;
199 	    }
200 	    *ep++ = CEND;		/* terminate expression */
201 	    *alt++ = 0;			/* terminal alternative list */
202 	    return Nullch;		/* return success */
203 	}
204 	if (c != '*')
205 	    lastep = ep;
206 	if (!RE) {			/* just a normal search string? */
207 	    *ep++ = CCHR;		/* everything is a normal char */
208 	    *ep++ = c;
209 	}
210 	else				/* it is a regular expression */
211 	    switch (c) {
212 
213 		case '\\':		/* meta something */
214 		    switch (c = *strp++) {
215 		    case '(':
216 			if (compex->nbra >= NBRA) {
217 #ifdef VERBOSE
218 			    retmes = "Too many parens";
219 #endif
220 			    goto cerror;
221 			}
222 			*bracketp++ = ++compex->nbra;
223 			*ep++ = CBRA;
224 			*ep++ = compex->nbra;
225 			break;
226 		    case '|':
227 			if (bracketp>bracket) {
228 #ifdef VERBOSE
229 			    retmes = "No \\| in parens";	/* Alas! */
230 #endif
231 			    goto cerror;
232 			}
233 			*ep++ = CEND;
234 			*alt++ = ep;
235 			break;
236 		    case ')':
237 			if (bracketp <= bracket) {
238 #ifdef VERBOSE
239 			    retmes = "Unmatched right paren";
240 #endif
241 			    goto cerror;
242 			}
243 			*ep++ = CKET;
244 			*ep++ = *--bracketp;
245 			break;
246 		    case 'w':
247 			*ep++ = WORD;
248 			break;
249 		    case 'W':
250 			*ep++ = NWORD;
251 			break;
252 		    case 'b':
253 			*ep++ = WBOUND;
254 			break;
255 		    case 'B':
256 			*ep++ = NWBOUND;
257 			break;
258 		    case '0': case '1': case '2': case '3': case '4':
259 		    case '5': case '6': case '7': case '8': case '9':
260 			*ep++ = CBACK;
261 			*ep++ = c - '0';
262 			break;
263 		    default:
264 			*ep++ = CCHR;
265 			if (c == '\0')
266 			    goto cerror;
267 			*ep++ = c;
268 			break;
269 		    }
270 		    break;
271 		case '.':
272 		    *ep++ = CDOT;
273 		    continue;
274 
275 		case '*':
276 		    if (lastep == 0 || *lastep == CBRA || *lastep == CKET
277 			|| *lastep == CIRC
278 			|| (*lastep&STAR)|| *lastep>NWORD)
279 			goto defchar;
280 		    *lastep |= STAR;
281 		    continue;
282 
283 		case '^':
284 		    if (ep != compex->expbuf && ep[-1] != CEND)
285 			goto defchar;
286 		    *ep++ = CIRC;
287 		    continue;
288 
289 		case '$':
290 		    if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
291 			goto defchar;
292 		    *ep++ = CDOL;
293 		    continue;
294 
295 		case '[': {		/* character class */
296 		    register int i;
297 
298 		    if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
299 			ep = grow_eb(compex, ep, alt); /* reserve bitmap */
300 
301 		    for (i = BMAPSIZ; i; --i)
302 			ep[i] = 0;
303 
304 		    if ((c = *strp++) == '^') {
305 			c = *strp++;
306 			*ep++ = NCCL;	/* negated */
307 		    }
308 		    else
309 			*ep++ = CCL;	/* normal */
310 
311 		    i = 0;		/* remember oldchar */
312 		    do {
313 			if (c == '\0') {
314 #ifdef VERBOSE
315 			    retmes = "Missing ]";
316 #endif
317 			    goto cerror;
318 			}
319 			if (*strp == '-' && *(++strp) != ']' && *strp)
320 			    i = *strp++;
321 			else
322 			    i = c;
323 			while (c <= i) {
324 			    ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
325 			    if (fold && isalpha(c))
326 				ep[(c ^ 32) / BITSPERBYTE] |=
327 				    1 << ((c ^ 32) % BITSPERBYTE);
328 					/* set the other bit too */
329 			    c++;
330 			}
331 		    } while ((c = *strp++) != ']');
332 		    ep += BMAPSIZ;
333 		    continue;
334 		}
335 
336 	    defchar:
337 		default:
338 		    *ep++ = CCHR;
339 		    *ep++ = c;
340 	    }
341     }
342 cerror:
343     compex->expbuf[0] = 0;
344     compex->nbra = 0;
345     return retmes;
346 }
347 
348 char *
grow_eb(compex,epp,alt)349 grow_eb(compex, epp, alt)
350 register COMPEX *compex;
351 char *epp;
352 char **alt;
353 {
354     register char *oldbuf = compex->expbuf;
355     register char **altlist = compex->alternatives;
356 
357     compex->eblen += 80;
358     compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4);
359     if (compex->expbuf != oldbuf) {	/* realloc can change expbuf! */
360 	epp += compex->expbuf - oldbuf;
361 	while (altlist != alt)
362 	    *altlist++ += compex->expbuf - oldbuf;
363     }
364     return epp;
365 }
366 
367 char *
execute(compex,addr)368 execute(compex, addr)
369 register COMPEX *compex;
370 char *addr;
371 {
372     register char *p1 = addr;
373     register char *trt = trans;
374     register int c;
375 
376     if (addr == Nullch || compex->expbuf == Nullch)
377 	return Nullch;
378     if (compex->nbra) {			/* any brackets? */
379 	for (c = 0; c <= compex->nbra; c++)
380 	    compex->braslist[c] = compex->braelist[c] = Nullch;
381 	if (compex->brastr)
382 	    free(compex->brastr);
383 	compex->brastr = savestr(p1);	/* in case p1 is not static */
384 	p1 = compex->brastr;		/* ! */
385     }
386     case_fold(compex->do_folding);	/* make sure table is correct */
387     FirstCharacter = p1;		/* for ^ tests */
388     if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) {
389 	c = trt[compex->expbuf[1]];	/* fast check for first character */
390 	do {
391 	    if (trt[*p1] == c && advance(compex, p1, compex->expbuf))
392 		return p1;
393 	    p1++;
394 	} while (*p1 && !err);
395 	if (err) err = 0;
396 	return Nullch;
397     }
398     else {			/* regular algorithm */
399 	do {
400 	    register char **alt = compex->alternatives;
401 	    while (*alt) {
402 		if (advance(compex, p1, *alt++))
403 		    return p1;
404 	    }
405 	    p1++;
406 	} while (*p1 && !err);
407 	if (err) err = 0;
408 	return Nullch;
409     }
410    /*NOTREACHED*/
411 }
412 
413 /* advance the match of the regular expression starting at ep along the
414    string lp, simulates an NDFSA */
415 bool
advance(compex,lp,ep)416 advance(compex, lp, ep)
417 register COMPEX *compex;
418 register char *ep;
419 register char *lp;
420 {
421     register char *curlp;
422     register char *trt = trans;
423     register int i;
424 
425     while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET)
426 	switch (*ep++) {
427 
428 	    case CCHR:
429 		if (trt[*ep++] != trt[*lp]) return FALSE;
430 		lp++;
431 		continue;
432 
433 	    case CDOT:
434 		if (*lp == '\n') return FALSE;
435 		lp++;
436 		continue;
437 
438 	    case CDOL:
439 		if (!*lp || *lp == '\n')
440 		    continue;
441 		return FALSE;
442 
443 	    case CIRC:
444 		if (lp == FirstCharacter || lp[-1]=='\n')
445 		    continue;
446 		return FALSE;
447 
448 	    case WORD:
449 		if (isalnum(*lp)) {
450 		    lp++;
451 		    continue;
452 		}
453 		return FALSE;
454 
455 	    case NWORD:
456 		if (!isalnum(*lp)) {
457 		    lp++;
458 		    continue;
459 		}
460 		return FALSE;
461 
462 	    case WBOUND:
463 		if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
464 			(!*lp || !isalnum(*lp)) )
465 		    continue;
466 		return FALSE;
467 
468 	    case NWBOUND:
469 		if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
470 			(!*lp || !isalnum(*lp)))
471 		    continue;
472 		return FALSE;
473 
474 	    case CEND:
475 		return TRUE;
476 
477 	    case CCL:
478 		if (cclass(ep, *lp, 1)) {
479 		    ep += BMAPSIZ;
480 		    lp++;
481 		    continue;
482 		}
483 		return FALSE;
484 
485 	    case NCCL:
486 		if (cclass(ep, *lp, 0)) {
487 		    ep += BMAPSIZ;
488 		    lp++;
489 		    continue;
490 		}
491 		return FALSE;
492 
493 	    case CBRA:
494 		compex->braslist[*ep++] = lp;
495 		continue;
496 
497 	    case CKET:
498 		i = *ep++;
499 		compex->braelist[i] = lp;
500 		compex->braelist[0] = lp;
501 		compex->braslist[0] = compex->braslist[i];
502 		continue;
503 
504 	    case CBACK:
505 		if (compex->braelist[i = *ep++] == 0) {
506 		    fputs("bad braces\n",stdout) FLUSH;
507 		    err = TRUE;
508 		    return FALSE;
509 		}
510 		if (backref(compex, i, lp)) {
511 		    lp += compex->braelist[i] - compex->braslist[i];
512 		    continue;
513 		}
514 		return FALSE;
515 
516 	    case CBACK | STAR:
517 		if (compex->braelist[i = *ep++] == 0) {
518 		    fputs("bad braces\n",stdout) FLUSH;
519 		    err = TRUE;
520 		    return FALSE;
521 		}
522 		curlp = lp;
523 		while (backref(compex, i, lp)) {
524 		    lp += compex->braelist[i] - compex->braslist[i];
525 		}
526 		while (lp >= curlp) {
527 		    if (advance(compex, lp, ep))
528 			return TRUE;
529 		    lp -= compex->braelist[i] - compex->braslist[i];
530 		}
531 		continue;
532 
533 	    case CDOT | STAR:
534 		curlp = lp;
535 		while (*lp++ && lp[-1] != '\n');
536 		goto star;
537 
538 	    case WORD | STAR:
539 		curlp = lp;
540 		while (*lp++ && isalnum(lp[-1]));
541 		goto star;
542 
543 	    case NWORD | STAR:
544 		curlp = lp;
545 		while (*lp++ && !isalnum(lp[-1]));
546 		goto star;
547 
548 	    case CCHR | STAR:
549 		curlp = lp;
550 		while (*lp++ && trt[lp[-1]] == trt[*ep]);
551 		ep++;
552 		goto star;
553 
554 	    case CCL | STAR:
555 	    case NCCL | STAR:
556 		curlp = lp;
557 		while (*lp++ && cclass(ep, lp[-1], ep[-1] == (CCL | STAR)));
558 		ep += BMAPSIZ;
559 		goto star;
560 
561 	star:
562 		do {
563 		    lp--;
564 		    if (advance(compex, lp, ep))
565 			return TRUE;
566 		} while (lp > curlp);
567 		return FALSE;
568 
569 	    default:
570 		fputs("Badly compiled pattern\n",stdout) FLUSH;
571 		err = TRUE;
572 		return -1;
573 	}
574     if (*ep == CEND || *ep == CDOL || *ep == WBOUND) {
575 	return TRUE;
576     }
577     return FALSE;
578 }
579 
580 bool
backref(compex,i,lp)581 backref(compex, i, lp)
582 register COMPEX *compex;
583 register int i;
584 register char *lp;
585 {
586     register char *bp;
587 
588     bp = compex->braslist[i];
589     while (*lp && *bp == *lp) {
590 	bp++;
591 	lp++;
592 	if (bp >= compex->braelist[i])
593 	    return TRUE;
594     }
595     return FALSE;
596 }
597 
598 bool
cclass(set,c,af)599 cclass(set, c, af)
600 register char  *set;
601 register int c;
602 int af;
603 {
604     c &= 0177;
605 #if BITSPERBYTE == 8
606     if (set[c >> 3] & 1 << (c & 7))
607 #else
608     if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
609 #endif
610 	return af;
611     return !af;
612 }
613