1 /* $Id: search.c,v 3.0 1992/02/01 03:09:32 davison Trn $
2 */
3
4 /* string search routines */
5
6 /* Copyright (c) 1981,1980 James Gosling */
7
8 /* Modified Aug. 12, 1981 by Tom London to include regular expressions
9 as in ed. RE stuff hacked over by jag to correct a few major problems,
10 mainly dealing with searching within the buffer rather than copying
11 each line to a separate array. Newlines can now appear in RE's */
12
13 /* Ripped to shreds and glued back together to make a search package,
14 * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
15 * Changes include:
16 * Buffer, window, and mlisp stuff gone.
17 * Translation tables reduced to 1 table.
18 * Expression buffer is now dynamically allocated.
19 * Character classes now implemented with a bitmap.
20 */
21
22 #include "EXTERN.h"
23 #include "common.h"
24 #include "util.h"
25 #include "INTERN.h"
26 #include "search.h"
27
28 #ifndef BITSPERBYTE
29 #define BITSPERBYTE 8
30 #endif
31
32 #define BMAPSIZ (127 / BITSPERBYTE + 1)
33
34 /* meta characters in the "compiled" form of a regular expression */
35 #define CBRA 2 /* \( -- begin bracket */
36 #define CCHR 4 /* a vanilla character */
37 #define CDOT 6 /* . -- match anything except a newline */
38 #define CCL 8 /* [...] -- character class */
39 #define NCCL 10 /* [^...] -- negated character class */
40 #define CDOL 12 /* $ -- matches the end of a line */
41 #define CEND 14 /* The end of the pattern */
42 #define CKET 16 /* \) -- close bracket */
43 #define CBACK 18 /* \N -- backreference to the Nth bracketed
44 string */
45 #define CIRC 20 /* ^ matches the beginning of a line */
46
47 #define WORD 32 /* matches word character \w */
48 #define NWORD 34 /* matches non-word characer \W */
49 #define WBOUND 36 /* matches word boundary \b */
50 #define NWBOUND 38 /* matches non-(word boundary) \B */
51
52 #define STAR 01 /* * -- Kleene star, repeats the previous
53 REas many times as possible; the value
54 ORs with the other operator types */
55
56 #define ASCSIZ 0200
57 typedef char TRANSTABLE[ASCSIZ];
58
59 static TRANSTABLE trans = {
60 0000,0001,0002,0003,0004,0005,0006,0007,
61 0010,0011,0012,0013,0014,0015,0016,0017,
62 0020,0021,0022,0023,0024,0025,0026,0027,
63 0030,0031,0032,0033,0034,0035,0036,0037,
64 0040,0041,0042,0043,0044,0045,0046,0047,
65 0050,0051,0052,0053,0054,0055,0056,0057,
66 0060,0061,0062,0063,0064,0065,0066,0067,
67 0070,0071,0072,0073,0074,0075,0076,0077,
68 0100,0101,0102,0103,0104,0105,0106,0107,
69 0110,0111,0112,0113,0114,0115,0116,0117,
70 0120,0121,0122,0123,0124,0125,0126,0127,
71 0130,0131,0132,0133,0134,0135,0136,0137,
72 0140,0141,0142,0143,0144,0145,0146,0147,
73 0150,0151,0152,0153,0154,0155,0156,0157,
74 0160,0161,0162,0163,0164,0165,0166,0167,
75 0170,0171,0172,0173,0174,0175,0176,0177,
76 };
77 static bool folding = FALSE;
78
79 static int err;
80 static char *FirstCharacter;
81
82 void
search_init()83 search_init()
84 {
85 #ifdef UNDEF
86 register int i;
87
88 for (i = 0; i < ASCSIZ; i++)
89 trans[i] = i;
90 #else
91 ;
92 #endif
93 }
94
95 void
init_compex(compex)96 init_compex(compex)
97 register COMPEX *compex;
98 {
99 /* the following must start off zeroed */
100
101 compex->eblen = 0;
102 compex->brastr = Nullch;
103 }
104
105 void
free_compex(compex)106 free_compex(compex)
107 register COMPEX *compex;
108 {
109 if (compex->eblen) {
110 free(compex->expbuf);
111 compex->eblen = 0;
112 }
113 if (compex->brastr) {
114 free(compex->brastr);
115 compex->brastr = Nullch;
116 }
117 }
118
119 static char *gbr_str = Nullch;
120 static int gbr_siz = 0;
121
122 char *
getbracket(compex,n)123 getbracket(compex,n)
124 register COMPEX *compex;
125 int n;
126 {
127 int length = compex->braelist[n] - compex->braslist[n];
128
129 if (!compex->nbra)
130 return Nullch;
131 if (n > compex->nbra || !compex->braelist[n] || length < 0)
132 return nullstr;
133 growstr(&gbr_str, &gbr_siz, length+1);
134 safecpy(gbr_str, compex->braslist[n], length+1);
135 return gbr_str;
136 }
137
138 void
case_fold(which)139 case_fold(which)
140 int which;
141 {
142 register int i;
143
144 if (which != folding) {
145 if (which) {
146 for (i = 'A'; i <= 'Z'; i++)
147 trans[i] = tolower(i);
148 }
149 else {
150 for (i = 'A'; i <= 'Z'; i++)
151 trans[i] = i;
152 }
153 folding = which;
154 }
155 }
156
157 /* Compile the given regular expression into a [secret] internal format */
158
159 char *
compile(compex,strp,RE,fold)160 compile(compex, strp, RE, fold)
161 register COMPEX *compex;
162 register char *strp;
163 int RE;
164 int fold;
165 {
166 register int c;
167 register char *ep;
168 char *lastep;
169 char bracket[NBRA],
170 *bracketp;
171 char **alt = compex->alternatives;
172 char *retmes = "Badly formed search string";
173
174 case_fold(compex->do_folding = fold);
175 if (!compex->eblen) {
176 compex->expbuf = safemalloc(84);
177 compex->eblen = 80;
178 }
179 ep = compex->expbuf; /* point at expression buffer */
180 *alt++ = ep; /* first alternative starts here */
181 bracketp = bracket; /* first bracket goes here */
182 if (*strp == 0) { /* nothing to compile? */
183 if (*ep == 0) /* nothing there yet? */
184 return "Null search string";
185 return Nullch; /* just keep old expression */
186 }
187 compex->nbra = 0; /* no brackets yet */
188 lastep = 0;
189 for (;;) {
190 if (ep + 4 - compex->expbuf >= compex->eblen)
191 ep = grow_eb(compex, ep, alt);
192 c = *strp++; /* fetch next char of pattern */
193 if (c == 0) { /* end of pattern? */
194 if (bracketp != bracket) { /* balanced brackets? */
195 #ifdef VERBOSE
196 retmes = "Unbalanced parens";
197 #endif
198 goto cerror;
199 }
200 *ep++ = CEND; /* terminate expression */
201 *alt++ = 0; /* terminal alternative list */
202 return Nullch; /* return success */
203 }
204 if (c != '*')
205 lastep = ep;
206 if (!RE) { /* just a normal search string? */
207 *ep++ = CCHR; /* everything is a normal char */
208 *ep++ = c;
209 }
210 else /* it is a regular expression */
211 switch (c) {
212
213 case '\\': /* meta something */
214 switch (c = *strp++) {
215 case '(':
216 if (compex->nbra >= NBRA) {
217 #ifdef VERBOSE
218 retmes = "Too many parens";
219 #endif
220 goto cerror;
221 }
222 *bracketp++ = ++compex->nbra;
223 *ep++ = CBRA;
224 *ep++ = compex->nbra;
225 break;
226 case '|':
227 if (bracketp>bracket) {
228 #ifdef VERBOSE
229 retmes = "No \\| in parens"; /* Alas! */
230 #endif
231 goto cerror;
232 }
233 *ep++ = CEND;
234 *alt++ = ep;
235 break;
236 case ')':
237 if (bracketp <= bracket) {
238 #ifdef VERBOSE
239 retmes = "Unmatched right paren";
240 #endif
241 goto cerror;
242 }
243 *ep++ = CKET;
244 *ep++ = *--bracketp;
245 break;
246 case 'w':
247 *ep++ = WORD;
248 break;
249 case 'W':
250 *ep++ = NWORD;
251 break;
252 case 'b':
253 *ep++ = WBOUND;
254 break;
255 case 'B':
256 *ep++ = NWBOUND;
257 break;
258 case '0': case '1': case '2': case '3': case '4':
259 case '5': case '6': case '7': case '8': case '9':
260 *ep++ = CBACK;
261 *ep++ = c - '0';
262 break;
263 default:
264 *ep++ = CCHR;
265 if (c == '\0')
266 goto cerror;
267 *ep++ = c;
268 break;
269 }
270 break;
271 case '.':
272 *ep++ = CDOT;
273 continue;
274
275 case '*':
276 if (lastep == 0 || *lastep == CBRA || *lastep == CKET
277 || *lastep == CIRC
278 || (*lastep&STAR)|| *lastep>NWORD)
279 goto defchar;
280 *lastep |= STAR;
281 continue;
282
283 case '^':
284 if (ep != compex->expbuf && ep[-1] != CEND)
285 goto defchar;
286 *ep++ = CIRC;
287 continue;
288
289 case '$':
290 if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
291 goto defchar;
292 *ep++ = CDOL;
293 continue;
294
295 case '[': { /* character class */
296 register int i;
297
298 if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
299 ep = grow_eb(compex, ep, alt); /* reserve bitmap */
300
301 for (i = BMAPSIZ; i; --i)
302 ep[i] = 0;
303
304 if ((c = *strp++) == '^') {
305 c = *strp++;
306 *ep++ = NCCL; /* negated */
307 }
308 else
309 *ep++ = CCL; /* normal */
310
311 i = 0; /* remember oldchar */
312 do {
313 if (c == '\0') {
314 #ifdef VERBOSE
315 retmes = "Missing ]";
316 #endif
317 goto cerror;
318 }
319 if (*strp == '-' && *(++strp) != ']' && *strp)
320 i = *strp++;
321 else
322 i = c;
323 while (c <= i) {
324 ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
325 if (fold && isalpha(c))
326 ep[(c ^ 32) / BITSPERBYTE] |=
327 1 << ((c ^ 32) % BITSPERBYTE);
328 /* set the other bit too */
329 c++;
330 }
331 } while ((c = *strp++) != ']');
332 ep += BMAPSIZ;
333 continue;
334 }
335
336 defchar:
337 default:
338 *ep++ = CCHR;
339 *ep++ = c;
340 }
341 }
342 cerror:
343 compex->expbuf[0] = 0;
344 compex->nbra = 0;
345 return retmes;
346 }
347
348 char *
grow_eb(compex,epp,alt)349 grow_eb(compex, epp, alt)
350 register COMPEX *compex;
351 char *epp;
352 char **alt;
353 {
354 register char *oldbuf = compex->expbuf;
355 register char **altlist = compex->alternatives;
356
357 compex->eblen += 80;
358 compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4);
359 if (compex->expbuf != oldbuf) { /* realloc can change expbuf! */
360 epp += compex->expbuf - oldbuf;
361 while (altlist != alt)
362 *altlist++ += compex->expbuf - oldbuf;
363 }
364 return epp;
365 }
366
367 char *
execute(compex,addr)368 execute(compex, addr)
369 register COMPEX *compex;
370 char *addr;
371 {
372 register char *p1 = addr;
373 register char *trt = trans;
374 register int c;
375
376 if (addr == Nullch || compex->expbuf == Nullch)
377 return Nullch;
378 if (compex->nbra) { /* any brackets? */
379 for (c = 0; c <= compex->nbra; c++)
380 compex->braslist[c] = compex->braelist[c] = Nullch;
381 if (compex->brastr)
382 free(compex->brastr);
383 compex->brastr = savestr(p1); /* in case p1 is not static */
384 p1 = compex->brastr; /* ! */
385 }
386 case_fold(compex->do_folding); /* make sure table is correct */
387 FirstCharacter = p1; /* for ^ tests */
388 if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) {
389 c = trt[compex->expbuf[1]]; /* fast check for first character */
390 do {
391 if (trt[*p1] == c && advance(compex, p1, compex->expbuf))
392 return p1;
393 p1++;
394 } while (*p1 && !err);
395 if (err) err = 0;
396 return Nullch;
397 }
398 else { /* regular algorithm */
399 do {
400 register char **alt = compex->alternatives;
401 while (*alt) {
402 if (advance(compex, p1, *alt++))
403 return p1;
404 }
405 p1++;
406 } while (*p1 && !err);
407 if (err) err = 0;
408 return Nullch;
409 }
410 /*NOTREACHED*/
411 }
412
413 /* advance the match of the regular expression starting at ep along the
414 string lp, simulates an NDFSA */
415 bool
advance(compex,lp,ep)416 advance(compex, lp, ep)
417 register COMPEX *compex;
418 register char *ep;
419 register char *lp;
420 {
421 register char *curlp;
422 register char *trt = trans;
423 register int i;
424
425 while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET)
426 switch (*ep++) {
427
428 case CCHR:
429 if (trt[*ep++] != trt[*lp]) return FALSE;
430 lp++;
431 continue;
432
433 case CDOT:
434 if (*lp == '\n') return FALSE;
435 lp++;
436 continue;
437
438 case CDOL:
439 if (!*lp || *lp == '\n')
440 continue;
441 return FALSE;
442
443 case CIRC:
444 if (lp == FirstCharacter || lp[-1]=='\n')
445 continue;
446 return FALSE;
447
448 case WORD:
449 if (isalnum(*lp)) {
450 lp++;
451 continue;
452 }
453 return FALSE;
454
455 case NWORD:
456 if (!isalnum(*lp)) {
457 lp++;
458 continue;
459 }
460 return FALSE;
461
462 case WBOUND:
463 if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
464 (!*lp || !isalnum(*lp)) )
465 continue;
466 return FALSE;
467
468 case NWBOUND:
469 if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
470 (!*lp || !isalnum(*lp)))
471 continue;
472 return FALSE;
473
474 case CEND:
475 return TRUE;
476
477 case CCL:
478 if (cclass(ep, *lp, 1)) {
479 ep += BMAPSIZ;
480 lp++;
481 continue;
482 }
483 return FALSE;
484
485 case NCCL:
486 if (cclass(ep, *lp, 0)) {
487 ep += BMAPSIZ;
488 lp++;
489 continue;
490 }
491 return FALSE;
492
493 case CBRA:
494 compex->braslist[*ep++] = lp;
495 continue;
496
497 case CKET:
498 i = *ep++;
499 compex->braelist[i] = lp;
500 compex->braelist[0] = lp;
501 compex->braslist[0] = compex->braslist[i];
502 continue;
503
504 case CBACK:
505 if (compex->braelist[i = *ep++] == 0) {
506 fputs("bad braces\n",stdout) FLUSH;
507 err = TRUE;
508 return FALSE;
509 }
510 if (backref(compex, i, lp)) {
511 lp += compex->braelist[i] - compex->braslist[i];
512 continue;
513 }
514 return FALSE;
515
516 case CBACK | STAR:
517 if (compex->braelist[i = *ep++] == 0) {
518 fputs("bad braces\n",stdout) FLUSH;
519 err = TRUE;
520 return FALSE;
521 }
522 curlp = lp;
523 while (backref(compex, i, lp)) {
524 lp += compex->braelist[i] - compex->braslist[i];
525 }
526 while (lp >= curlp) {
527 if (advance(compex, lp, ep))
528 return TRUE;
529 lp -= compex->braelist[i] - compex->braslist[i];
530 }
531 continue;
532
533 case CDOT | STAR:
534 curlp = lp;
535 while (*lp++ && lp[-1] != '\n');
536 goto star;
537
538 case WORD | STAR:
539 curlp = lp;
540 while (*lp++ && isalnum(lp[-1]));
541 goto star;
542
543 case NWORD | STAR:
544 curlp = lp;
545 while (*lp++ && !isalnum(lp[-1]));
546 goto star;
547
548 case CCHR | STAR:
549 curlp = lp;
550 while (*lp++ && trt[lp[-1]] == trt[*ep]);
551 ep++;
552 goto star;
553
554 case CCL | STAR:
555 case NCCL | STAR:
556 curlp = lp;
557 while (*lp++ && cclass(ep, lp[-1], ep[-1] == (CCL | STAR)));
558 ep += BMAPSIZ;
559 goto star;
560
561 star:
562 do {
563 lp--;
564 if (advance(compex, lp, ep))
565 return TRUE;
566 } while (lp > curlp);
567 return FALSE;
568
569 default:
570 fputs("Badly compiled pattern\n",stdout) FLUSH;
571 err = TRUE;
572 return -1;
573 }
574 if (*ep == CEND || *ep == CDOL || *ep == WBOUND) {
575 return TRUE;
576 }
577 return FALSE;
578 }
579
580 bool
backref(compex,i,lp)581 backref(compex, i, lp)
582 register COMPEX *compex;
583 register int i;
584 register char *lp;
585 {
586 register char *bp;
587
588 bp = compex->braslist[i];
589 while (*lp && *bp == *lp) {
590 bp++;
591 lp++;
592 if (bp >= compex->braelist[i])
593 return TRUE;
594 }
595 return FALSE;
596 }
597
598 bool
cclass(set,c,af)599 cclass(set, c, af)
600 register char *set;
601 register int c;
602 int af;
603 {
604 c &= 0177;
605 #if BITSPERBYTE == 8
606 if (set[c >> 3] & 1 << (c & 7))
607 #else
608 if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
609 #endif
610 return af;
611 return !af;
612 }
613