1 /*
2 * Copyright (c) 1992-1998 Michael A. Cooper.
3 * This software may be freely used and distributed provided it is not
4 * sold for profit or used in part or in whole for commercial gain
5 * without prior written agreement, and the author is credited
6 * appropriately.
7 */
8 /*
9 * Copyright (c) 1983 Regents of the University of California.
10 * All rights reserved.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41 #if !defined(lint)
42 static char RCSid[] =
43 "$Id: regex.c,v 6.3 1998/11/10 04:14:28 mcooper Exp $";
44
45 static char sccsid[] = "@(#)regex.c 5.2 (Berkeley) 3/9/86";
46
47 static char copyright[] =
48 "Copyright (c) 1992-1998 Michael A. Cooper.\n\
49 @(#) Copyright (c) 1983 Regents of the University of California.\n\
50 All rights reserved.\n";
51 #endif /* not lint */
52
53 /*
54 * routines to do regular expression matching
55 *
56 * Entry points:
57 *
58 * re_comp(s)
59 * char *s;
60 * ... returns 0 if the string s was compiled successfully,
61 * a pointer to an error message otherwise.
62 * If passed 0 or a null string returns without changing
63 * the currently compiled re (see note 11 below).
64 *
65 * re_exec(s)
66 * char *s;
67 * ... returns 1 if the string s matches the last compiled regular
68 * expression,
69 * 0 if the string s failed to match the last compiled
70 * regular expression, and
71 * -1 if the compiled regular expression was invalid
72 * (indicating an internal error).
73 *
74 * The strings passed to both re_comp and re_exec may have trailing or
75 * embedded newline characters; they are terminated by nulls.
76 *
77 * The identity of the author of these routines is lost in antiquity;
78 * this is essentially the same as the re code in the original V6 ed.
79 *
80 * The regular expressions recognized are described below. This description
81 * is essentially the same as that for ed.
82 *
83 * A regular expression specifies a set of strings of characters.
84 * A member of this set of strings is said to be matched by
85 * the regular expression. In the following specification for
86 * regular expressions the word `character' means any character but NUL.
87 *
88 * 1. Any character except a special character matches itself.
89 * Special characters are the regular expression delimiter plus
90 * \ [ . and sometimes ^ * $.
91 * 2. A . matches any character.
92 * 3. A \ followed by any character except a digit or ( )
93 * matches that character.
94 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
95 * character in (or not in) s. In s, \ has no special meaning,
96 * and ] may only appear as the first letter. A substring
97 * a-b, with a and b in ascending ASCII order, stands for
98 * the inclusive range of ASCII characters.
99 * 5. A regular expression of form 1-4 followed by * matches a
100 * sequence of 0 or more matches of the regular expression.
101 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
102 * matches what x matches.
103 * 7. A \ followed by a digit n matches a copy of the string that the
104 * bracketed regular expression beginning with the nth \( matched.
105 * 8. A regular expression of form 1-8, x, followed by a regular
106 * expression of form 1-7, y matches a match for x followed by
107 * a match for y, with the x match being as long as possible
108 * while still permitting a y match.
109 * 9. A regular expression of form 1-8 preceded by ^ (or followed
110 * by $), is constrained to matches that begin at the left
111 * (or end at the right) end of a line.
112 * 10. A regular expression of form 1-9 picks out the longest among
113 * the leftmost matches in a line.
114 * 11. An empty regular expression stands for a copy of the last
115 * regular expression encountered.
116 */
117
118 /*
119 * constants for re's
120 */
121 #define CBRA 1
122 #define CCHR 2
123 #define CDOT 4
124 #define CCL 6
125 #define NCCL 8
126 #define CDOL 10
127 #define CEOF 11
128 #define CKET 12
129 #define CBACK 18
130
131 #define CSTAR 01
132
133 #define ESIZE 512
134 #define NBRA 9
135
136 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
137 static char circf;
138 static int advance();
139
140 /*
141 * compile the regular expression argument into a dfa
142 */
143 char *
re_comp(sp)144 re_comp(sp)
145 register char *sp;
146 {
147 register int c;
148 register char *ep = expbuf;
149 int cclcnt, numbra = 0;
150 char *lastep = 0;
151 char bracket[NBRA];
152 char *bracketp = &bracket[0];
153 static char *retoolong = "Regular expression too long";
154
155 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
156
157 if (sp == 0 || *sp == '\0') {
158 if (*ep == 0)
159 return("No previous regular expression");
160 return(0);
161 }
162 if (*sp == '^') {
163 circf = 1;
164 sp++;
165 }
166 else
167 circf = 0;
168 for (;;) {
169 if (ep >= &expbuf[ESIZE])
170 comerr(retoolong);
171 if ((c = *sp++) == '\0') {
172 if (bracketp != bracket)
173 comerr("unmatched \\(");
174 *ep++ = CEOF;
175 *ep++ = 0;
176 return(0);
177 }
178 if (c != '*')
179 lastep = ep;
180 switch (c) {
181
182 case '.':
183 *ep++ = CDOT;
184 continue;
185
186 case '*':
187 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
188 goto defchar;
189 *lastep |= CSTAR;
190 continue;
191
192 case '$':
193 if (*sp != '\0')
194 goto defchar;
195 *ep++ = CDOL;
196 continue;
197
198 case '[':
199 *ep++ = CCL;
200 *ep++ = 0;
201 cclcnt = 1;
202 if ((c = *sp++) == '^') {
203 c = *sp++;
204 ep[-2] = NCCL;
205 }
206 do {
207 if (c == '\0')
208 comerr("missing ]");
209 if (c == '-' && ep [-1] != 0) {
210 if ((c = *sp++) == ']') {
211 *ep++ = '-';
212 cclcnt++;
213 break;
214 }
215 while (ep[-1] < c) {
216 *ep = ep[-1] + 1;
217 ep++;
218 cclcnt++;
219 if (ep >= &expbuf[ESIZE])
220 comerr(retoolong);
221 }
222 }
223 *ep++ = c;
224 cclcnt++;
225 if (ep >= &expbuf[ESIZE])
226 comerr(retoolong);
227 } while ((c = *sp++) != ']');
228 lastep[1] = cclcnt;
229 continue;
230
231 case '\\':
232 if ((c = *sp++) == '(') {
233 if (numbra >= NBRA)
234 comerr("too many \\(\\) pairs");
235 *bracketp++ = numbra;
236 *ep++ = CBRA;
237 *ep++ = numbra++;
238 continue;
239 }
240 if (c == ')') {
241 if (bracketp <= bracket)
242 comerr("unmatched \\)");
243 *ep++ = CKET;
244 *ep++ = *--bracketp;
245 continue;
246 }
247 if (c >= '1' && c < ('1' + NBRA)) {
248 *ep++ = CBACK;
249 *ep++ = c - '1';
250 continue;
251 }
252 *ep++ = CCHR;
253 *ep++ = c;
254 continue;
255
256 defchar:
257 default:
258 *ep++ = CCHR;
259 *ep++ = c;
260 }
261 }
262 }
263
264 /*
265 * match the argument string against the compiled re
266 */
267 int
re_exec(p1)268 re_exec(p1)
269 register char *p1;
270 {
271 register char *p2 = expbuf;
272 register int c;
273 int rv;
274
275 for (c = 0; c < NBRA; c++) {
276 braslist[c] = 0;
277 braelist[c] = 0;
278 }
279 if (circf)
280 return((advance(p1, p2)));
281 /*
282 * fast check for first character
283 */
284 if (*p2 == CCHR) {
285 c = p2[1];
286 do {
287 if (*p1 != c)
288 continue;
289 if (rv = advance(p1, p2))
290 return(rv);
291 } while (*p1++);
292 return(0);
293 }
294 /*
295 * regular algorithm
296 */
297 do
298 if (rv = advance(p1, p2))
299 return(rv);
300 while (*p1++);
301 return(0);
302 }
303
304 /*
305 * try to match the next thing in the dfa
306 */
307 static int
advance(lp,ep)308 advance(lp, ep)
309 register char *lp, *ep;
310 {
311 register char *curlp;
312 int ct, i;
313 int rv;
314
315 for (;;)
316 switch (*ep++) {
317
318 case CCHR:
319 if (*ep++ == *lp++)
320 continue;
321 return(0);
322
323 case CDOT:
324 if (*lp++)
325 continue;
326 return(0);
327
328 case CDOL:
329 if (*lp == '\0')
330 continue;
331 return(0);
332
333 case CEOF:
334 return(1);
335
336 case CCL:
337 if (cclass(ep, *lp++, 1)) {
338 ep += *ep;
339 continue;
340 }
341 return(0);
342
343 case NCCL:
344 if (cclass(ep, *lp++, 0)) {
345 ep += *ep;
346 continue;
347 }
348 return(0);
349
350 case CBRA:
351 braslist[*ep++] = lp;
352 continue;
353
354 case CKET:
355 braelist[*ep++] = lp;
356 continue;
357
358 case CBACK:
359 if (braelist[i = *ep++] == 0)
360 return(-1);
361 if (backref(i, lp)) {
362 lp += braelist[i] - braslist[i];
363 continue;
364 }
365 return(0);
366
367 case CBACK|CSTAR:
368 if (braelist[i = *ep++] == 0)
369 return(-1);
370 curlp = lp;
371 ct = braelist[i] - braslist[i];
372 while (backref(i, lp))
373 lp += ct;
374 while (lp >= curlp) {
375 if (rv = advance(lp, ep))
376 return(rv);
377 lp -= ct;
378 }
379 continue;
380
381 case CDOT|CSTAR:
382 curlp = lp;
383 while (*lp++)
384 ;
385 goto star;
386
387 case CCHR|CSTAR:
388 curlp = lp;
389 while (*lp++ == *ep)
390 ;
391 ep++;
392 goto star;
393
394 case CCL|CSTAR:
395 case NCCL|CSTAR:
396 curlp = lp;
397 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
398 ;
399 ep += *ep;
400 goto star;
401
402 star:
403 do {
404 lp--;
405 if (rv = advance(lp, ep))
406 return(rv);
407 } while (lp > curlp);
408 return(0);
409
410 default:
411 return(-1);
412 }
413 }
414
backref(i,lp)415 backref(i, lp)
416 register int i;
417 register char *lp;
418 {
419 register char *bp;
420
421 bp = braslist[i];
422 while (*bp++ == *lp++)
423 if (bp >= braelist[i])
424 return(1);
425 return(0);
426 }
427
428 int
cclass(set,c,af)429 cclass(set, c, af)
430 register char *set, c;
431 int af;
432 {
433 register int n;
434
435 if (c == 0)
436 return(0);
437 n = *set++;
438 while (--n)
439 if (*set++ == c)
440 return(af);
441 return(! af);
442 }
443