1 /*
2 * lexical analyzer
3 * This file is #included by regcomp.c.
4 *
5 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 *
7 * Development of this software was funded, in part, by Cray Research Inc.,
8 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 * Corporation, none of whom are responsible for the results. The author
10 * thanks all of them.
11 *
12 * Redistribution and use in source and binary forms -- with or without
13 * modification -- are permitted for any purpose, provided that
14 * redistributions in source form retain this entire copyright notice and
15 * indicate the origin and nature of any modifications.
16 *
17 * I'd appreciate being given credit for this package in the documentation
18 * of software which uses it, but that is not a requirement.
19 *
20 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * src/backend/regex/regc_lex.c
32 *
33 */
34
/*
 * Scanning macros.  Every one of these expects a "struct vars *v" to be
 * in scope at the point of use.
 */
#define ATEOS()		(v->now >= v->stop)				/* no input left? */
#define HAVE(n)		((v->stop - v->now) >= (n))		/* at least n chrs left? */
/* do the next one/two/three chrs match the given plain characters? */
#define NEXT1(c)	(!ATEOS() && v->now[0] == CHR(c))
#define NEXT2(a,b)	(HAVE(2) && v->now[0] == CHR(a) && v->now[1] == CHR(b))
#define NEXT3(a,b,c) (HAVE(3) && v->now[0] == CHR(a) && \
					 v->now[1] == CHR(b) && \
					 v->now[2] == CHR(c))
/* record the token type (and optionally its value) for the parser */
#define SET(c)		(v->nexttype = (c))
#define SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))
/* record a token and return success from the calling function */
#define RET(c)		return (SET(c), 1)
#define RETV(c, n)	return (SETV(c, n), 1)
/* record an error and return failure from the calling function */
#define FAILW(e)	return (ERR(e), 0)		/* ERR does SET(EOS) */
/* was the previously returned token of type t? */
#define LASTTYPE(t) (v->lasttype == (t))

/*
 * Lexical contexts: which sub-language the scanner is currently inside.
 */
#define L_ERE	1				/* mainline ERE/ARE */
#define L_BRE	2				/* mainline BRE */
#define L_Q		3				/* REG_QUOTE (literal string) */
#define L_EBND	4				/* ERE/ARE bound */
#define L_BBND	5				/* BRE bound */
#define L_BRACK 6				/* bracket expression */
#define L_CEL	7				/* collating element */
#define L_ECL	8				/* equivalence class */
#define L_CCL	9				/* character class */
#define INTOCON(c)	(v->lexcon = (c))		/* switch to context c */
#define INCON(con)	(v->lexcon == (con))	/* currently in context con? */

/* pointer just past the last element of a chr array (not a pointer!) */
#define ENDOF(array)	((array) + sizeof(array)/sizeof(chr))
65
66 /*
67 * lexstart - set up lexical stuff, scan leading options
68 */
69 static void
lexstart(struct vars * v)70 lexstart(struct vars *v)
71 {
72 prefixes(v); /* may turn on new type bits etc. */
73 NOERR();
74
75 if (v->cflags & REG_QUOTE)
76 {
77 assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
78 INTOCON(L_Q);
79 }
80 else if (v->cflags & REG_EXTENDED)
81 {
82 assert(!(v->cflags & REG_QUOTE));
83 INTOCON(L_ERE);
84 }
85 else
86 {
87 assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
88 INTOCON(L_BRE);
89 }
90
91 v->nexttype = EMPTY; /* remember we were at the start */
92 next(v); /* set up the first token */
93 }
94
95 /*
96 * prefixes - implement various special prefixes
97 */
98 static void
prefixes(struct vars * v)99 prefixes(struct vars *v)
100 {
101 /* literal string doesn't get any of this stuff */
102 if (v->cflags & REG_QUOTE)
103 return;
104
105 /* initial "***" gets special things */
106 if (HAVE(4) && NEXT3('*', '*', '*'))
107 switch (*(v->now + 3))
108 {
109 case CHR('?'): /* "***?" error, msg shows version */
110 ERR(REG_BADPAT);
111 return; /* proceed no further */
112 break;
113 case CHR('='): /* "***=" shifts to literal string */
114 NOTE(REG_UNONPOSIX);
115 v->cflags |= REG_QUOTE;
116 v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
117 v->now += 4;
118 return; /* and there can be no more prefixes */
119 break;
120 case CHR(':'): /* "***:" shifts to AREs */
121 NOTE(REG_UNONPOSIX);
122 v->cflags |= REG_ADVANCED;
123 v->now += 4;
124 break;
125 default: /* otherwise *** is just an error */
126 ERR(REG_BADRPT);
127 return;
128 break;
129 }
130
131 /* BREs and EREs don't get embedded options */
132 if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
133 return;
134
135 /* embedded options (AREs only) */
136 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
137 {
138 NOTE(REG_UNONPOSIX);
139 v->now += 2;
140 for (; !ATEOS() && iscalpha(*v->now); v->now++)
141 switch (*v->now)
142 {
143 case CHR('b'): /* BREs (but why???) */
144 v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
145 break;
146 case CHR('c'): /* case sensitive */
147 v->cflags &= ~REG_ICASE;
148 break;
149 case CHR('e'): /* plain EREs */
150 v->cflags |= REG_EXTENDED;
151 v->cflags &= ~(REG_ADVF | REG_QUOTE);
152 break;
153 case CHR('i'): /* case insensitive */
154 v->cflags |= REG_ICASE;
155 break;
156 case CHR('m'): /* Perloid synonym for n */
157 case CHR('n'): /* \n affects ^ $ . [^ */
158 v->cflags |= REG_NEWLINE;
159 break;
160 case CHR('p'): /* ~Perl, \n affects . [^ */
161 v->cflags |= REG_NLSTOP;
162 v->cflags &= ~REG_NLANCH;
163 break;
164 case CHR('q'): /* literal string */
165 v->cflags |= REG_QUOTE;
166 v->cflags &= ~REG_ADVANCED;
167 break;
168 case CHR('s'): /* single line, \n ordinary */
169 v->cflags &= ~REG_NEWLINE;
170 break;
171 case CHR('t'): /* tight syntax */
172 v->cflags &= ~REG_EXPANDED;
173 break;
174 case CHR('w'): /* weird, \n affects ^ $ only */
175 v->cflags &= ~REG_NLSTOP;
176 v->cflags |= REG_NLANCH;
177 break;
178 case CHR('x'): /* expanded syntax */
179 v->cflags |= REG_EXPANDED;
180 break;
181 default:
182 ERR(REG_BADOPT);
183 return;
184 }
185 if (!NEXT1(')'))
186 {
187 ERR(REG_BADOPT);
188 return;
189 }
190 v->now++;
191 if (v->cflags & REG_QUOTE)
192 v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
193 }
194 }
195
196 /*
197 * lexnest - "call a subroutine", interpolating string at the lexical level
198 *
199 * Note, this is not a very general facility. There are a number of
200 * implicit assumptions about what sorts of strings can be subroutines.
201 */
202 static void
lexnest(struct vars * v,const chr * beginp,const chr * endp)203 lexnest(struct vars *v,
204 const chr *beginp, /* start of interpolation */
205 const chr *endp) /* one past end of interpolation */
206 {
207 assert(v->savenow == NULL); /* only one level of nesting */
208 v->savenow = v->now;
209 v->savestop = v->stop;
210 v->now = beginp;
211 v->stop = endp;
212 }
213
214 /*
215 * string constants to interpolate as expansions of things like \d
216 */
217 static const chr backd[] = { /* \d */
218 CHR('['), CHR('['), CHR(':'),
219 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
220 CHR(':'), CHR(']'), CHR(']')
221 };
222 static const chr backD[] = { /* \D */
223 CHR('['), CHR('^'), CHR('['), CHR(':'),
224 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
225 CHR(':'), CHR(']'), CHR(']')
226 };
227 static const chr brbackd[] = { /* \d within brackets */
228 CHR('['), CHR(':'),
229 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
230 CHR(':'), CHR(']')
231 };
232 static const chr backs[] = { /* \s */
233 CHR('['), CHR('['), CHR(':'),
234 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
235 CHR(':'), CHR(']'), CHR(']')
236 };
237 static const chr backS[] = { /* \S */
238 CHR('['), CHR('^'), CHR('['), CHR(':'),
239 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
240 CHR(':'), CHR(']'), CHR(']')
241 };
242 static const chr brbacks[] = { /* \s within brackets */
243 CHR('['), CHR(':'),
244 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
245 CHR(':'), CHR(']')
246 };
247 static const chr backw[] = { /* \w */
248 CHR('['), CHR('['), CHR(':'),
249 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
250 CHR(':'), CHR(']'), CHR('_'), CHR(']')
251 };
252 static const chr backW[] = { /* \W */
253 CHR('['), CHR('^'), CHR('['), CHR(':'),
254 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
255 CHR(':'), CHR(']'), CHR('_'), CHR(']')
256 };
257 static const chr brbackw[] = { /* \w within brackets */
258 CHR('['), CHR(':'),
259 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
260 CHR(':'), CHR(']'), CHR('_')
261 };
262
263 /*
264 * lexword - interpolate a bracket expression for word characters
265 * Possibly ought to inquire whether there is a "word" character class.
266 */
267 static void
lexword(struct vars * v)268 lexword(struct vars *v)
269 {
270 lexnest(v, backw, ENDOF(backw));
271 }
272
273 /*
274 * next - get next token
275 */
276 static int /* 1 normal, 0 failure */
next(struct vars * v)277 next(struct vars *v)
278 {
279 chr c;
280
281 /* errors yield an infinite sequence of failures */
282 if (ISERR())
283 return 0; /* the error has set nexttype to EOS */
284
285 /* remember flavor of last token */
286 v->lasttype = v->nexttype;
287
288 /* REG_BOSONLY */
289 if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
290 {
291 /* at start of a REG_BOSONLY RE */
292 RETV(SBEGIN, 0); /* same as \A */
293 }
294
295 /* if we're nested and we've hit end, return to outer level */
296 if (v->savenow != NULL && ATEOS())
297 {
298 v->now = v->savenow;
299 v->stop = v->savestop;
300 v->savenow = v->savestop = NULL;
301 }
302
303 /* skip white space etc. if appropriate (not in literal or []) */
304 if (v->cflags & REG_EXPANDED)
305 switch (v->lexcon)
306 {
307 case L_ERE:
308 case L_BRE:
309 case L_EBND:
310 case L_BBND:
311 skip(v);
312 break;
313 }
314
315 /* handle EOS, depending on context */
316 if (ATEOS())
317 {
318 switch (v->lexcon)
319 {
320 case L_ERE:
321 case L_BRE:
322 case L_Q:
323 RET(EOS);
324 break;
325 case L_EBND:
326 case L_BBND:
327 FAILW(REG_EBRACE);
328 break;
329 case L_BRACK:
330 case L_CEL:
331 case L_ECL:
332 case L_CCL:
333 FAILW(REG_EBRACK);
334 break;
335 }
336 assert(NOTREACHED);
337 }
338
339 /* okay, time to actually get a character */
340 c = *v->now++;
341
342 /* deal with the easy contexts, punt EREs to code below */
343 switch (v->lexcon)
344 {
345 case L_BRE: /* punt BREs to separate function */
346 return brenext(v, c);
347 break;
348 case L_ERE: /* see below */
349 break;
350 case L_Q: /* literal strings are easy */
351 RETV(PLAIN, c);
352 break;
353 case L_BBND: /* bounds are fairly simple */
354 case L_EBND:
355 switch (c)
356 {
357 case CHR('0'):
358 case CHR('1'):
359 case CHR('2'):
360 case CHR('3'):
361 case CHR('4'):
362 case CHR('5'):
363 case CHR('6'):
364 case CHR('7'):
365 case CHR('8'):
366 case CHR('9'):
367 RETV(DIGIT, (chr) DIGITVAL(c));
368 break;
369 case CHR(','):
370 RET(',');
371 break;
372 case CHR('}'): /* ERE bound ends with } */
373 if (INCON(L_EBND))
374 {
375 INTOCON(L_ERE);
376 if ((v->cflags & REG_ADVF) && NEXT1('?'))
377 {
378 v->now++;
379 NOTE(REG_UNONPOSIX);
380 RETV('}', 0);
381 }
382 RETV('}', 1);
383 }
384 else
385 FAILW(REG_BADBR);
386 break;
387 case CHR('\\'): /* BRE bound ends with \} */
388 if (INCON(L_BBND) && NEXT1('}'))
389 {
390 v->now++;
391 INTOCON(L_BRE);
392 RETV('}', 1);
393 }
394 else
395 FAILW(REG_BADBR);
396 break;
397 default:
398 FAILW(REG_BADBR);
399 break;
400 }
401 assert(NOTREACHED);
402 break;
403 case L_BRACK: /* brackets are not too hard */
404 switch (c)
405 {
406 case CHR(']'):
407 if (LASTTYPE('['))
408 RETV(PLAIN, c);
409 else
410 {
411 INTOCON((v->cflags & REG_EXTENDED) ?
412 L_ERE : L_BRE);
413 RET(']');
414 }
415 break;
416 case CHR('\\'):
417 NOTE(REG_UBBS);
418 if (!(v->cflags & REG_ADVF))
419 RETV(PLAIN, c);
420 NOTE(REG_UNONPOSIX);
421 if (ATEOS())
422 FAILW(REG_EESCAPE);
423 (DISCARD) lexescape(v);
424 switch (v->nexttype)
425 { /* not all escapes okay here */
426 case PLAIN:
427 return 1;
428 break;
429 case CCLASS:
430 switch (v->nextvalue)
431 {
432 case 'd':
433 lexnest(v, brbackd, ENDOF(brbackd));
434 break;
435 case 's':
436 lexnest(v, brbacks, ENDOF(brbacks));
437 break;
438 case 'w':
439 lexnest(v, brbackw, ENDOF(brbackw));
440 break;
441 default:
442 FAILW(REG_EESCAPE);
443 break;
444 }
445 /* lexnest done, back up and try again */
446 v->nexttype = v->lasttype;
447 return next(v);
448 break;
449 }
450 /* not one of the acceptable escapes */
451 FAILW(REG_EESCAPE);
452 break;
453 case CHR('-'):
454 if (LASTTYPE('[') || NEXT1(']'))
455 RETV(PLAIN, c);
456 else
457 RETV(RANGE, c);
458 break;
459 case CHR('['):
460 if (ATEOS())
461 FAILW(REG_EBRACK);
462 switch (*v->now++)
463 {
464 case CHR('.'):
465 INTOCON(L_CEL);
466 /* might or might not be locale-specific */
467 RET(COLLEL);
468 break;
469 case CHR('='):
470 INTOCON(L_ECL);
471 NOTE(REG_ULOCALE);
472 RET(ECLASS);
473 break;
474 case CHR(':'):
475 INTOCON(L_CCL);
476 NOTE(REG_ULOCALE);
477 RET(CCLASS);
478 break;
479 default: /* oops */
480 v->now--;
481 RETV(PLAIN, c);
482 break;
483 }
484 assert(NOTREACHED);
485 break;
486 default:
487 RETV(PLAIN, c);
488 break;
489 }
490 assert(NOTREACHED);
491 break;
492 case L_CEL: /* collating elements are easy */
493 if (c == CHR('.') && NEXT1(']'))
494 {
495 v->now++;
496 INTOCON(L_BRACK);
497 RETV(END, '.');
498 }
499 else
500 RETV(PLAIN, c);
501 break;
502 case L_ECL: /* ditto equivalence classes */
503 if (c == CHR('=') && NEXT1(']'))
504 {
505 v->now++;
506 INTOCON(L_BRACK);
507 RETV(END, '=');
508 }
509 else
510 RETV(PLAIN, c);
511 break;
512 case L_CCL: /* ditto character classes */
513 if (c == CHR(':') && NEXT1(']'))
514 {
515 v->now++;
516 INTOCON(L_BRACK);
517 RETV(END, ':');
518 }
519 else
520 RETV(PLAIN, c);
521 break;
522 default:
523 assert(NOTREACHED);
524 break;
525 }
526
527 /* that got rid of everything except EREs and AREs */
528 assert(INCON(L_ERE));
529
530 /* deal with EREs and AREs, except for backslashes */
531 switch (c)
532 {
533 case CHR('|'):
534 RET('|');
535 break;
536 case CHR('*'):
537 if ((v->cflags & REG_ADVF) && NEXT1('?'))
538 {
539 v->now++;
540 NOTE(REG_UNONPOSIX);
541 RETV('*', 0);
542 }
543 RETV('*', 1);
544 break;
545 case CHR('+'):
546 if ((v->cflags & REG_ADVF) && NEXT1('?'))
547 {
548 v->now++;
549 NOTE(REG_UNONPOSIX);
550 RETV('+', 0);
551 }
552 RETV('+', 1);
553 break;
554 case CHR('?'):
555 if ((v->cflags & REG_ADVF) && NEXT1('?'))
556 {
557 v->now++;
558 NOTE(REG_UNONPOSIX);
559 RETV('?', 0);
560 }
561 RETV('?', 1);
562 break;
563 case CHR('{'): /* bounds start or plain character */
564 if (v->cflags & REG_EXPANDED)
565 skip(v);
566 if (ATEOS() || !iscdigit(*v->now))
567 {
568 NOTE(REG_UBRACES);
569 NOTE(REG_UUNSPEC);
570 RETV(PLAIN, c);
571 }
572 else
573 {
574 NOTE(REG_UBOUNDS);
575 INTOCON(L_EBND);
576 RET('{');
577 }
578 assert(NOTREACHED);
579 break;
580 case CHR('('): /* parenthesis, or advanced extension */
581 if ((v->cflags & REG_ADVF) && NEXT1('?'))
582 {
583 NOTE(REG_UNONPOSIX);
584 v->now++;
585 if (ATEOS())
586 FAILW(REG_BADRPT);
587 switch (*v->now++)
588 {
589 case CHR(':'): /* non-capturing paren */
590 RETV('(', 0);
591 break;
592 case CHR('#'): /* comment */
593 while (!ATEOS() && *v->now != CHR(')'))
594 v->now++;
595 if (!ATEOS())
596 v->now++;
597 assert(v->nexttype == v->lasttype);
598 return next(v);
599 break;
600 case CHR('='): /* positive lookahead */
601 NOTE(REG_ULOOKAROUND);
602 RETV(LACON, LATYPE_AHEAD_POS);
603 break;
604 case CHR('!'): /* negative lookahead */
605 NOTE(REG_ULOOKAROUND);
606 RETV(LACON, LATYPE_AHEAD_NEG);
607 break;
608 case CHR('<'):
609 if (ATEOS())
610 FAILW(REG_BADRPT);
611 switch (*v->now++)
612 {
613 case CHR('='): /* positive lookbehind */
614 NOTE(REG_ULOOKAROUND);
615 RETV(LACON, LATYPE_BEHIND_POS);
616 break;
617 case CHR('!'): /* negative lookbehind */
618 NOTE(REG_ULOOKAROUND);
619 RETV(LACON, LATYPE_BEHIND_NEG);
620 break;
621 default:
622 FAILW(REG_BADRPT);
623 break;
624 }
625 assert(NOTREACHED);
626 break;
627 default:
628 FAILW(REG_BADRPT);
629 break;
630 }
631 assert(NOTREACHED);
632 }
633 if (v->cflags & REG_NOSUB)
634 RETV('(', 0); /* all parens non-capturing */
635 else
636 RETV('(', 1);
637 break;
638 case CHR(')'):
639 if (LASTTYPE('('))
640 NOTE(REG_UUNSPEC);
641 RETV(')', c);
642 break;
643 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
644 if (HAVE(6) && *(v->now + 0) == CHR('[') &&
645 *(v->now + 1) == CHR(':') &&
646 (*(v->now + 2) == CHR('<') ||
647 *(v->now + 2) == CHR('>')) &&
648 *(v->now + 3) == CHR(':') &&
649 *(v->now + 4) == CHR(']') &&
650 *(v->now + 5) == CHR(']'))
651 {
652 c = *(v->now + 2);
653 v->now += 6;
654 NOTE(REG_UNONPOSIX);
655 RET((c == CHR('<')) ? '<' : '>');
656 }
657 INTOCON(L_BRACK);
658 if (NEXT1('^'))
659 {
660 v->now++;
661 RETV('[', 0);
662 }
663 RETV('[', 1);
664 break;
665 case CHR('.'):
666 RET('.');
667 break;
668 case CHR('^'):
669 RET('^');
670 break;
671 case CHR('$'):
672 RET('$');
673 break;
674 case CHR('\\'): /* mostly punt backslashes to code below */
675 if (ATEOS())
676 FAILW(REG_EESCAPE);
677 break;
678 default: /* ordinary character */
679 RETV(PLAIN, c);
680 break;
681 }
682
683 /* ERE/ARE backslash handling; backslash already eaten */
684 assert(!ATEOS());
685 if (!(v->cflags & REG_ADVF))
686 { /* only AREs have non-trivial escapes */
687 if (iscalnum(*v->now))
688 {
689 NOTE(REG_UBSALNUM);
690 NOTE(REG_UUNSPEC);
691 }
692 RETV(PLAIN, *v->now++);
693 }
694 (DISCARD) lexescape(v);
695 if (ISERR())
696 FAILW(REG_EESCAPE);
697 if (v->nexttype == CCLASS)
698 { /* fudge at lexical level */
699 switch (v->nextvalue)
700 {
701 case 'd':
702 lexnest(v, backd, ENDOF(backd));
703 break;
704 case 'D':
705 lexnest(v, backD, ENDOF(backD));
706 break;
707 case 's':
708 lexnest(v, backs, ENDOF(backs));
709 break;
710 case 'S':
711 lexnest(v, backS, ENDOF(backS));
712 break;
713 case 'w':
714 lexnest(v, backw, ENDOF(backw));
715 break;
716 case 'W':
717 lexnest(v, backW, ENDOF(backW));
718 break;
719 default:
720 assert(NOTREACHED);
721 FAILW(REG_ASSERT);
722 break;
723 }
724 /* lexnest done, back up and try again */
725 v->nexttype = v->lasttype;
726 return next(v);
727 }
728 /* otherwise, lexescape has already done the work */
729 return !ISERR();
730 }
731
732 /*
733 * lexescape - parse an ARE backslash escape (backslash already eaten)
734 * Note slightly nonstandard use of the CCLASS type code.
735 */
736 static int /* not actually used, but convenient for RETV */
lexescape(struct vars * v)737 lexescape(struct vars *v)
738 {
739 chr c;
740 static const chr alert[] = {
741 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
742 };
743 static const chr esc[] = {
744 CHR('E'), CHR('S'), CHR('C')
745 };
746 const chr *save;
747
748 assert(v->cflags & REG_ADVF);
749
750 assert(!ATEOS());
751 c = *v->now++;
752 if (!iscalnum(c))
753 RETV(PLAIN, c);
754
755 NOTE(REG_UNONPOSIX);
756 switch (c)
757 {
758 case CHR('a'):
759 RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
760 break;
761 case CHR('A'):
762 RETV(SBEGIN, 0);
763 break;
764 case CHR('b'):
765 RETV(PLAIN, CHR('\b'));
766 break;
767 case CHR('B'):
768 RETV(PLAIN, CHR('\\'));
769 break;
770 case CHR('c'):
771 NOTE(REG_UUNPORT);
772 if (ATEOS())
773 FAILW(REG_EESCAPE);
774 RETV(PLAIN, (chr) (*v->now++ & 037));
775 break;
776 case CHR('d'):
777 NOTE(REG_ULOCALE);
778 RETV(CCLASS, 'd');
779 break;
780 case CHR('D'):
781 NOTE(REG_ULOCALE);
782 RETV(CCLASS, 'D');
783 break;
784 case CHR('e'):
785 NOTE(REG_UUNPORT);
786 RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
787 break;
788 case CHR('f'):
789 RETV(PLAIN, CHR('\f'));
790 break;
791 case CHR('m'):
792 RET('<');
793 break;
794 case CHR('M'):
795 RET('>');
796 break;
797 case CHR('n'):
798 RETV(PLAIN, CHR('\n'));
799 break;
800 case CHR('r'):
801 RETV(PLAIN, CHR('\r'));
802 break;
803 case CHR('s'):
804 NOTE(REG_ULOCALE);
805 RETV(CCLASS, 's');
806 break;
807 case CHR('S'):
808 NOTE(REG_ULOCALE);
809 RETV(CCLASS, 'S');
810 break;
811 case CHR('t'):
812 RETV(PLAIN, CHR('\t'));
813 break;
814 case CHR('u'):
815 c = lexdigits(v, 16, 4, 4);
816 if (ISERR() || !CHR_IS_IN_RANGE(c))
817 FAILW(REG_EESCAPE);
818 RETV(PLAIN, c);
819 break;
820 case CHR('U'):
821 c = lexdigits(v, 16, 8, 8);
822 if (ISERR() || !CHR_IS_IN_RANGE(c))
823 FAILW(REG_EESCAPE);
824 RETV(PLAIN, c);
825 break;
826 case CHR('v'):
827 RETV(PLAIN, CHR('\v'));
828 break;
829 case CHR('w'):
830 NOTE(REG_ULOCALE);
831 RETV(CCLASS, 'w');
832 break;
833 case CHR('W'):
834 NOTE(REG_ULOCALE);
835 RETV(CCLASS, 'W');
836 break;
837 case CHR('x'):
838 NOTE(REG_UUNPORT);
839 c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
840 if (ISERR() || !CHR_IS_IN_RANGE(c))
841 FAILW(REG_EESCAPE);
842 RETV(PLAIN, c);
843 break;
844 case CHR('y'):
845 NOTE(REG_ULOCALE);
846 RETV(WBDRY, 0);
847 break;
848 case CHR('Y'):
849 NOTE(REG_ULOCALE);
850 RETV(NWBDRY, 0);
851 break;
852 case CHR('Z'):
853 RETV(SEND, 0);
854 break;
855 case CHR('1'):
856 case CHR('2'):
857 case CHR('3'):
858 case CHR('4'):
859 case CHR('5'):
860 case CHR('6'):
861 case CHR('7'):
862 case CHR('8'):
863 case CHR('9'):
864 save = v->now;
865 v->now--; /* put first digit back */
866 c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
867 if (ISERR())
868 FAILW(REG_EESCAPE);
869 /* ugly heuristic (first test is "exactly 1 digit?") */
870 if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp))
871 {
872 NOTE(REG_UBACKREF);
873 RETV(BACKREF, c);
874 }
875 /* oops, doesn't look like it's a backref after all... */
876 v->now = save;
877 /* and fall through into octal number */
878 case CHR('0'):
879 NOTE(REG_UUNPORT);
880 v->now--; /* put first digit back */
881 c = lexdigits(v, 8, 1, 3);
882 if (ISERR())
883 FAILW(REG_EESCAPE);
884 if (c > 0xff)
885 {
886 /* out of range, so we handled one digit too much */
887 v->now--;
888 c >>= 3;
889 }
890 RETV(PLAIN, c);
891 break;
892 default:
893 assert(iscalpha(c));
894 FAILW(REG_EESCAPE); /* unknown alphabetic escape */
895 break;
896 }
897 assert(NOTREACHED);
898 }
899
900 /*
901 * lexdigits - slurp up digits and return chr value
902 *
903 * This does not account for overflow; callers should range-check the result
904 * if maxlen is large enough to make that possible.
905 */
906 static chr /* chr value; errors signalled via ERR */
lexdigits(struct vars * v,int base,int minlen,int maxlen)907 lexdigits(struct vars *v,
908 int base,
909 int minlen,
910 int maxlen)
911 {
912 uchr n; /* unsigned to avoid overflow misbehavior */
913 int len;
914 chr c;
915 int d;
916 const uchr ub = (uchr) base;
917
918 n = 0;
919 for (len = 0; len < maxlen && !ATEOS(); len++)
920 {
921 c = *v->now++;
922 switch (c)
923 {
924 case CHR('0'):
925 case CHR('1'):
926 case CHR('2'):
927 case CHR('3'):
928 case CHR('4'):
929 case CHR('5'):
930 case CHR('6'):
931 case CHR('7'):
932 case CHR('8'):
933 case CHR('9'):
934 d = DIGITVAL(c);
935 break;
936 case CHR('a'):
937 case CHR('A'):
938 d = 10;
939 break;
940 case CHR('b'):
941 case CHR('B'):
942 d = 11;
943 break;
944 case CHR('c'):
945 case CHR('C'):
946 d = 12;
947 break;
948 case CHR('d'):
949 case CHR('D'):
950 d = 13;
951 break;
952 case CHR('e'):
953 case CHR('E'):
954 d = 14;
955 break;
956 case CHR('f'):
957 case CHR('F'):
958 d = 15;
959 break;
960 default:
961 v->now--; /* oops, not a digit at all */
962 d = -1;
963 break;
964 }
965
966 if (d >= base)
967 { /* not a plausible digit */
968 v->now--;
969 d = -1;
970 }
971 if (d < 0)
972 break; /* NOTE BREAK OUT */
973 n = n * ub + (uchr) d;
974 }
975 if (len < minlen)
976 ERR(REG_EESCAPE);
977
978 return (chr) n;
979 }
980
981 /*
982 * brenext - get next BRE token
983 *
984 * This is much like EREs except for all the stupid backslashes and the
985 * context-dependency of some things.
986 */
987 static int /* 1 normal, 0 failure */
brenext(struct vars * v,chr c)988 brenext(struct vars *v,
989 chr c)
990 {
991 switch (c)
992 {
993 case CHR('*'):
994 if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
995 RETV(PLAIN, c);
996 RETV('*', 1);
997 break;
998 case CHR('['):
999 if (HAVE(6) && *(v->now + 0) == CHR('[') &&
1000 *(v->now + 1) == CHR(':') &&
1001 (*(v->now + 2) == CHR('<') ||
1002 *(v->now + 2) == CHR('>')) &&
1003 *(v->now + 3) == CHR(':') &&
1004 *(v->now + 4) == CHR(']') &&
1005 *(v->now + 5) == CHR(']'))
1006 {
1007 c = *(v->now + 2);
1008 v->now += 6;
1009 NOTE(REG_UNONPOSIX);
1010 RET((c == CHR('<')) ? '<' : '>');
1011 }
1012 INTOCON(L_BRACK);
1013 if (NEXT1('^'))
1014 {
1015 v->now++;
1016 RETV('[', 0);
1017 }
1018 RETV('[', 1);
1019 break;
1020 case CHR('.'):
1021 RET('.');
1022 break;
1023 case CHR('^'):
1024 if (LASTTYPE(EMPTY))
1025 RET('^');
1026 if (LASTTYPE('('))
1027 {
1028 NOTE(REG_UUNSPEC);
1029 RET('^');
1030 }
1031 RETV(PLAIN, c);
1032 break;
1033 case CHR('$'):
1034 if (v->cflags & REG_EXPANDED)
1035 skip(v);
1036 if (ATEOS())
1037 RET('$');
1038 if (NEXT2('\\', ')'))
1039 {
1040 NOTE(REG_UUNSPEC);
1041 RET('$');
1042 }
1043 RETV(PLAIN, c);
1044 break;
1045 case CHR('\\'):
1046 break; /* see below */
1047 default:
1048 RETV(PLAIN, c);
1049 break;
1050 }
1051
1052 assert(c == CHR('\\'));
1053
1054 if (ATEOS())
1055 FAILW(REG_EESCAPE);
1056
1057 c = *v->now++;
1058 switch (c)
1059 {
1060 case CHR('{'):
1061 INTOCON(L_BBND);
1062 NOTE(REG_UBOUNDS);
1063 RET('{');
1064 break;
1065 case CHR('('):
1066 RETV('(', 1);
1067 break;
1068 case CHR(')'):
1069 RETV(')', c);
1070 break;
1071 case CHR('<'):
1072 NOTE(REG_UNONPOSIX);
1073 RET('<');
1074 break;
1075 case CHR('>'):
1076 NOTE(REG_UNONPOSIX);
1077 RET('>');
1078 break;
1079 case CHR('1'):
1080 case CHR('2'):
1081 case CHR('3'):
1082 case CHR('4'):
1083 case CHR('5'):
1084 case CHR('6'):
1085 case CHR('7'):
1086 case CHR('8'):
1087 case CHR('9'):
1088 NOTE(REG_UBACKREF);
1089 RETV(BACKREF, (chr) DIGITVAL(c));
1090 break;
1091 default:
1092 if (iscalnum(c))
1093 {
1094 NOTE(REG_UBSALNUM);
1095 NOTE(REG_UUNSPEC);
1096 }
1097 RETV(PLAIN, c);
1098 break;
1099 }
1100
1101 assert(NOTREACHED);
1102 return 0;
1103 }
1104
1105 /*
1106 * skip - skip white space and comments in expanded form
1107 */
1108 static void
skip(struct vars * v)1109 skip(struct vars *v)
1110 {
1111 const chr *start = v->now;
1112
1113 assert(v->cflags & REG_EXPANDED);
1114
1115 for (;;)
1116 {
1117 while (!ATEOS() && iscspace(*v->now))
1118 v->now++;
1119 if (ATEOS() || *v->now != CHR('#'))
1120 break; /* NOTE BREAK OUT */
1121 assert(NEXT1('#'));
1122 while (!ATEOS() && *v->now != CHR('\n'))
1123 v->now++;
1124 /* leave the newline to be picked up by the iscspace loop */
1125 }
1126
1127 if (v->now != start)
1128 NOTE(REG_UNONPOSIX);
1129 }
1130
1131 /*
1132 * newline - return the chr for a newline
1133 *
1134 * This helps confine use of CHR to this source file.
1135 */
1136 static chr
newline(void)1137 newline(void)
1138 {
1139 return CHR('\n');
1140 }
1141
1142 /*
1143 * chrnamed - return the chr known by a given (chr string) name
1144 *
1145 * The code is a bit clumsy, but this routine gets only such specialized
1146 * use that it hardly matters.
1147 */
1148 static chr
chrnamed(struct vars * v,const chr * startp,const chr * endp,chr lastresort)1149 chrnamed(struct vars *v,
1150 const chr *startp, /* start of name */
1151 const chr *endp, /* just past end of name */
1152 chr lastresort) /* what to return if name lookup fails */
1153 {
1154 chr c;
1155 int errsave;
1156 int e;
1157 struct cvec *cv;
1158
1159 errsave = v->err;
1160 v->err = 0;
1161 c = element(v, startp, endp);
1162 e = v->err;
1163 v->err = errsave;
1164
1165 if (e != 0)
1166 return lastresort;
1167
1168 cv = range(v, c, c, 0);
1169 if (cv->nchrs == 0)
1170 return lastresort;
1171 return cv->chrs[0];
1172 }
1173