1 /*
2  * lexical analyzer
3  * This file is #included by regcomp.c.
4  *
5  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6  *
7  * Development of this software was funded, in part, by Cray Research Inc.,
8  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9  * Corporation, none of whom are responsible for the results.  The author
10  * thanks all of them.
11  *
12  * Redistribution and use in source and binary forms -- with or without
13  * modification -- are permitted for any purpose, provided that
14  * redistributions in source form retain this entire copyright notice and
15  * indicate the origin and nature of any modifications.
16  *
17  * I'd appreciate being given credit for this package in the documentation
18  * of software which uses it, but that is not a requirement.
19  *
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * src/backend/regex/regc_lex.c
32  *
33  */
34 
35 /* scanning macros (know about v) */
36 #define ATEOS()		(v->now >= v->stop)
37 #define HAVE(n)		(v->stop - v->now >= (n))
38 #define NEXT1(c)	(!ATEOS() && *v->now == CHR(c))
39 #define NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
40 #define NEXT3(a,b,c)	(HAVE(3) && *v->now == CHR(a) && \
41 						*(v->now+1) == CHR(b) && \
42 						*(v->now+2) == CHR(c))
43 #define SET(c)		(v->nexttype = (c))
44 #define SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))
45 #define RET(c)		return (SET(c), 1)
46 #define RETV(c, n)	return (SETV(c, n), 1)
47 #define FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
48 #define LASTTYPE(t) (v->lasttype == (t))
49 
50 /* lexical contexts */
51 #define L_ERE	1				/* mainline ERE/ARE */
52 #define L_BRE	2				/* mainline BRE */
53 #define L_Q 3					/* REG_QUOTE */
54 #define L_EBND	4				/* ERE/ARE bound */
55 #define L_BBND	5				/* BRE bound */
56 #define L_BRACK 6				/* brackets */
57 #define L_CEL	7				/* collating element */
58 #define L_ECL	8				/* equivalence class */
59 #define L_CCL	9				/* character class */
60 #define INTOCON(c)	(v->lexcon = (c))
61 #define INCON(con)	(v->lexcon == (con))
62 
63 /* construct pointer past end of chr array */
64 #define ENDOF(array)	((array) + sizeof(array)/sizeof(chr))
65 
66 /*
67  * lexstart - set up lexical stuff, scan leading options
68  */
69 static void
lexstart(struct vars * v)70 lexstart(struct vars *v)
71 {
72 	prefixes(v);				/* may turn on new type bits etc. */
73 	NOERR();
74 
75 	if (v->cflags & REG_QUOTE)
76 	{
77 		assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
78 		INTOCON(L_Q);
79 	}
80 	else if (v->cflags & REG_EXTENDED)
81 	{
82 		assert(!(v->cflags & REG_QUOTE));
83 		INTOCON(L_ERE);
84 	}
85 	else
86 	{
87 		assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
88 		INTOCON(L_BRE);
89 	}
90 
91 	v->nexttype = EMPTY;		/* remember we were at the start */
92 	next(v);					/* set up the first token */
93 }
94 
95 /*
96  * prefixes - implement various special prefixes
97  */
98 static void
prefixes(struct vars * v)99 prefixes(struct vars *v)
100 {
101 	/* literal string doesn't get any of this stuff */
102 	if (v->cflags & REG_QUOTE)
103 		return;
104 
105 	/* initial "***" gets special things */
106 	if (HAVE(4) && NEXT3('*', '*', '*'))
107 		switch (*(v->now + 3))
108 		{
109 			case CHR('?'):		/* "***?" error, msg shows version */
110 				ERR(REG_BADPAT);
111 				return;			/* proceed no further */
112 				break;
113 			case CHR('='):		/* "***=" shifts to literal string */
114 				NOTE(REG_UNONPOSIX);
115 				v->cflags |= REG_QUOTE;
116 				v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
117 				v->now += 4;
118 				return;			/* and there can be no more prefixes */
119 				break;
120 			case CHR(':'):		/* "***:" shifts to AREs */
121 				NOTE(REG_UNONPOSIX);
122 				v->cflags |= REG_ADVANCED;
123 				v->now += 4;
124 				break;
125 			default:			/* otherwise *** is just an error */
126 				ERR(REG_BADRPT);
127 				return;
128 				break;
129 		}
130 
131 	/* BREs and EREs don't get embedded options */
132 	if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
133 		return;
134 
135 	/* embedded options (AREs only) */
136 	if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
137 	{
138 		NOTE(REG_UNONPOSIX);
139 		v->now += 2;
140 		for (; !ATEOS() && iscalpha(*v->now); v->now++)
141 			switch (*v->now)
142 			{
143 				case CHR('b'):	/* BREs (but why???) */
144 					v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
145 					break;
146 				case CHR('c'):	/* case sensitive */
147 					v->cflags &= ~REG_ICASE;
148 					break;
149 				case CHR('e'):	/* plain EREs */
150 					v->cflags |= REG_EXTENDED;
151 					v->cflags &= ~(REG_ADVF | REG_QUOTE);
152 					break;
153 				case CHR('i'):	/* case insensitive */
154 					v->cflags |= REG_ICASE;
155 					break;
156 				case CHR('m'):	/* Perloid synonym for n */
157 				case CHR('n'):	/* \n affects ^ $ . [^ */
158 					v->cflags |= REG_NEWLINE;
159 					break;
160 				case CHR('p'):	/* ~Perl, \n affects . [^ */
161 					v->cflags |= REG_NLSTOP;
162 					v->cflags &= ~REG_NLANCH;
163 					break;
164 				case CHR('q'):	/* literal string */
165 					v->cflags |= REG_QUOTE;
166 					v->cflags &= ~REG_ADVANCED;
167 					break;
168 				case CHR('s'):	/* single line, \n ordinary */
169 					v->cflags &= ~REG_NEWLINE;
170 					break;
171 				case CHR('t'):	/* tight syntax */
172 					v->cflags &= ~REG_EXPANDED;
173 					break;
174 				case CHR('w'):	/* weird, \n affects ^ $ only */
175 					v->cflags &= ~REG_NLSTOP;
176 					v->cflags |= REG_NLANCH;
177 					break;
178 				case CHR('x'):	/* expanded syntax */
179 					v->cflags |= REG_EXPANDED;
180 					break;
181 				default:
182 					ERR(REG_BADOPT);
183 					return;
184 			}
185 		if (!NEXT1(')'))
186 		{
187 			ERR(REG_BADOPT);
188 			return;
189 		}
190 		v->now++;
191 		if (v->cflags & REG_QUOTE)
192 			v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
193 	}
194 }
195 
196 /*
197  * lexnest - "call a subroutine", interpolating string at the lexical level
198  *
199  * Note, this is not a very general facility.  There are a number of
200  * implicit assumptions about what sorts of strings can be subroutines.
201  */
202 static void
lexnest(struct vars * v,const chr * beginp,const chr * endp)203 lexnest(struct vars *v,
204 		const chr *beginp,		/* start of interpolation */
205 		const chr *endp)		/* one past end of interpolation */
206 {
207 	assert(v->savenow == NULL); /* only one level of nesting */
208 	v->savenow = v->now;
209 	v->savestop = v->stop;
210 	v->now = beginp;
211 	v->stop = endp;
212 }
213 
214 /*
215  * string constants to interpolate as expansions of things like \d
216  */
217 static const chr backd[] = {	/* \d */
218 	CHR('['), CHR('['), CHR(':'),
219 	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
220 	CHR(':'), CHR(']'), CHR(']')
221 };
222 static const chr backD[] = {	/* \D */
223 	CHR('['), CHR('^'), CHR('['), CHR(':'),
224 	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
225 	CHR(':'), CHR(']'), CHR(']')
226 };
227 static const chr brbackd[] = {	/* \d within brackets */
228 	CHR('['), CHR(':'),
229 	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
230 	CHR(':'), CHR(']')
231 };
232 static const chr backs[] = {	/* \s */
233 	CHR('['), CHR('['), CHR(':'),
234 	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
235 	CHR(':'), CHR(']'), CHR(']')
236 };
237 static const chr backS[] = {	/* \S */
238 	CHR('['), CHR('^'), CHR('['), CHR(':'),
239 	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
240 	CHR(':'), CHR(']'), CHR(']')
241 };
242 static const chr brbacks[] = {	/* \s within brackets */
243 	CHR('['), CHR(':'),
244 	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
245 	CHR(':'), CHR(']')
246 };
247 static const chr backw[] = {	/* \w */
248 	CHR('['), CHR('['), CHR(':'),
249 	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
250 	CHR(':'), CHR(']'), CHR('_'), CHR(']')
251 };
252 static const chr backW[] = {	/* \W */
253 	CHR('['), CHR('^'), CHR('['), CHR(':'),
254 	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
255 	CHR(':'), CHR(']'), CHR('_'), CHR(']')
256 };
257 static const chr brbackw[] = {	/* \w within brackets */
258 	CHR('['), CHR(':'),
259 	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
260 	CHR(':'), CHR(']'), CHR('_')
261 };
262 
263 /*
264  * lexword - interpolate a bracket expression for word characters
265  * Possibly ought to inquire whether there is a "word" character class.
266  */
267 static void
lexword(struct vars * v)268 lexword(struct vars *v)
269 {
270 	lexnest(v, backw, ENDOF(backw));
271 }
272 
273 /*
274  * next - get next token
275  */
276 static int						/* 1 normal, 0 failure */
next(struct vars * v)277 next(struct vars *v)
278 {
279 	chr			c;
280 
281 	/* errors yield an infinite sequence of failures */
282 	if (ISERR())
283 		return 0;				/* the error has set nexttype to EOS */
284 
285 	/* remember flavor of last token */
286 	v->lasttype = v->nexttype;
287 
288 	/* REG_BOSONLY */
289 	if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
290 	{
291 		/* at start of a REG_BOSONLY RE */
292 		RETV(SBEGIN, 0);		/* same as \A */
293 	}
294 
295 	/* if we're nested and we've hit end, return to outer level */
296 	if (v->savenow != NULL && ATEOS())
297 	{
298 		v->now = v->savenow;
299 		v->stop = v->savestop;
300 		v->savenow = v->savestop = NULL;
301 	}
302 
303 	/* skip white space etc. if appropriate (not in literal or []) */
304 	if (v->cflags & REG_EXPANDED)
305 		switch (v->lexcon)
306 		{
307 			case L_ERE:
308 			case L_BRE:
309 			case L_EBND:
310 			case L_BBND:
311 				skip(v);
312 				break;
313 		}
314 
315 	/* handle EOS, depending on context */
316 	if (ATEOS())
317 	{
318 		switch (v->lexcon)
319 		{
320 			case L_ERE:
321 			case L_BRE:
322 			case L_Q:
323 				RET(EOS);
324 				break;
325 			case L_EBND:
326 			case L_BBND:
327 				FAILW(REG_EBRACE);
328 				break;
329 			case L_BRACK:
330 			case L_CEL:
331 			case L_ECL:
332 			case L_CCL:
333 				FAILW(REG_EBRACK);
334 				break;
335 		}
336 		assert(NOTREACHED);
337 	}
338 
339 	/* okay, time to actually get a character */
340 	c = *v->now++;
341 
342 	/* deal with the easy contexts, punt EREs to code below */
343 	switch (v->lexcon)
344 	{
345 		case L_BRE:				/* punt BREs to separate function */
346 			return brenext(v, c);
347 			break;
348 		case L_ERE:				/* see below */
349 			break;
350 		case L_Q:				/* literal strings are easy */
351 			RETV(PLAIN, c);
352 			break;
353 		case L_BBND:			/* bounds are fairly simple */
354 		case L_EBND:
355 			switch (c)
356 			{
357 				case CHR('0'):
358 				case CHR('1'):
359 				case CHR('2'):
360 				case CHR('3'):
361 				case CHR('4'):
362 				case CHR('5'):
363 				case CHR('6'):
364 				case CHR('7'):
365 				case CHR('8'):
366 				case CHR('9'):
367 					RETV(DIGIT, (chr) DIGITVAL(c));
368 					break;
369 				case CHR(','):
370 					RET(',');
371 					break;
372 				case CHR('}'):	/* ERE bound ends with } */
373 					if (INCON(L_EBND))
374 					{
375 						INTOCON(L_ERE);
376 						if ((v->cflags & REG_ADVF) && NEXT1('?'))
377 						{
378 							v->now++;
379 							NOTE(REG_UNONPOSIX);
380 							RETV('}', 0);
381 						}
382 						RETV('}', 1);
383 					}
384 					else
385 						FAILW(REG_BADBR);
386 					break;
387 				case CHR('\\'): /* BRE bound ends with \} */
388 					if (INCON(L_BBND) && NEXT1('}'))
389 					{
390 						v->now++;
391 						INTOCON(L_BRE);
392 						RETV('}', 1);
393 					}
394 					else
395 						FAILW(REG_BADBR);
396 					break;
397 				default:
398 					FAILW(REG_BADBR);
399 					break;
400 			}
401 			assert(NOTREACHED);
402 			break;
403 		case L_BRACK:			/* brackets are not too hard */
404 			switch (c)
405 			{
406 				case CHR(']'):
407 					if (LASTTYPE('['))
408 						RETV(PLAIN, c);
409 					else
410 					{
411 						INTOCON((v->cflags & REG_EXTENDED) ?
412 								L_ERE : L_BRE);
413 						RET(']');
414 					}
415 					break;
416 				case CHR('\\'):
417 					NOTE(REG_UBBS);
418 					if (!(v->cflags & REG_ADVF))
419 						RETV(PLAIN, c);
420 					NOTE(REG_UNONPOSIX);
421 					if (ATEOS())
422 						FAILW(REG_EESCAPE);
423 					(DISCARD) lexescape(v);
424 					switch (v->nexttype)
425 					{			/* not all escapes okay here */
426 						case PLAIN:
427 							return 1;
428 							break;
429 						case CCLASS:
430 							switch (v->nextvalue)
431 							{
432 								case 'd':
433 									lexnest(v, brbackd, ENDOF(brbackd));
434 									break;
435 								case 's':
436 									lexnest(v, brbacks, ENDOF(brbacks));
437 									break;
438 								case 'w':
439 									lexnest(v, brbackw, ENDOF(brbackw));
440 									break;
441 								default:
442 									FAILW(REG_EESCAPE);
443 									break;
444 							}
445 							/* lexnest done, back up and try again */
446 							v->nexttype = v->lasttype;
447 							return next(v);
448 							break;
449 					}
450 					/* not one of the acceptable escapes */
451 					FAILW(REG_EESCAPE);
452 					break;
453 				case CHR('-'):
454 					if (LASTTYPE('[') || NEXT1(']'))
455 						RETV(PLAIN, c);
456 					else
457 						RETV(RANGE, c);
458 					break;
459 				case CHR('['):
460 					if (ATEOS())
461 						FAILW(REG_EBRACK);
462 					switch (*v->now++)
463 					{
464 						case CHR('.'):
465 							INTOCON(L_CEL);
466 							/* might or might not be locale-specific */
467 							RET(COLLEL);
468 							break;
469 						case CHR('='):
470 							INTOCON(L_ECL);
471 							NOTE(REG_ULOCALE);
472 							RET(ECLASS);
473 							break;
474 						case CHR(':'):
475 							INTOCON(L_CCL);
476 							NOTE(REG_ULOCALE);
477 							RET(CCLASS);
478 							break;
479 						default:	/* oops */
480 							v->now--;
481 							RETV(PLAIN, c);
482 							break;
483 					}
484 					assert(NOTREACHED);
485 					break;
486 				default:
487 					RETV(PLAIN, c);
488 					break;
489 			}
490 			assert(NOTREACHED);
491 			break;
492 		case L_CEL:				/* collating elements are easy */
493 			if (c == CHR('.') && NEXT1(']'))
494 			{
495 				v->now++;
496 				INTOCON(L_BRACK);
497 				RETV(END, '.');
498 			}
499 			else
500 				RETV(PLAIN, c);
501 			break;
502 		case L_ECL:				/* ditto equivalence classes */
503 			if (c == CHR('=') && NEXT1(']'))
504 			{
505 				v->now++;
506 				INTOCON(L_BRACK);
507 				RETV(END, '=');
508 			}
509 			else
510 				RETV(PLAIN, c);
511 			break;
512 		case L_CCL:				/* ditto character classes */
513 			if (c == CHR(':') && NEXT1(']'))
514 			{
515 				v->now++;
516 				INTOCON(L_BRACK);
517 				RETV(END, ':');
518 			}
519 			else
520 				RETV(PLAIN, c);
521 			break;
522 		default:
523 			assert(NOTREACHED);
524 			break;
525 	}
526 
527 	/* that got rid of everything except EREs and AREs */
528 	assert(INCON(L_ERE));
529 
530 	/* deal with EREs and AREs, except for backslashes */
531 	switch (c)
532 	{
533 		case CHR('|'):
534 			RET('|');
535 			break;
536 		case CHR('*'):
537 			if ((v->cflags & REG_ADVF) && NEXT1('?'))
538 			{
539 				v->now++;
540 				NOTE(REG_UNONPOSIX);
541 				RETV('*', 0);
542 			}
543 			RETV('*', 1);
544 			break;
545 		case CHR('+'):
546 			if ((v->cflags & REG_ADVF) && NEXT1('?'))
547 			{
548 				v->now++;
549 				NOTE(REG_UNONPOSIX);
550 				RETV('+', 0);
551 			}
552 			RETV('+', 1);
553 			break;
554 		case CHR('?'):
555 			if ((v->cflags & REG_ADVF) && NEXT1('?'))
556 			{
557 				v->now++;
558 				NOTE(REG_UNONPOSIX);
559 				RETV('?', 0);
560 			}
561 			RETV('?', 1);
562 			break;
563 		case CHR('{'):			/* bounds start or plain character */
564 			if (v->cflags & REG_EXPANDED)
565 				skip(v);
566 			if (ATEOS() || !iscdigit(*v->now))
567 			{
568 				NOTE(REG_UBRACES);
569 				NOTE(REG_UUNSPEC);
570 				RETV(PLAIN, c);
571 			}
572 			else
573 			{
574 				NOTE(REG_UBOUNDS);
575 				INTOCON(L_EBND);
576 				RET('{');
577 			}
578 			assert(NOTREACHED);
579 			break;
580 		case CHR('('):			/* parenthesis, or advanced extension */
581 			if ((v->cflags & REG_ADVF) && NEXT1('?'))
582 			{
583 				NOTE(REG_UNONPOSIX);
584 				v->now++;
585 				if (ATEOS())
586 					FAILW(REG_BADRPT);
587 				switch (*v->now++)
588 				{
589 					case CHR(':'):	/* non-capturing paren */
590 						RETV('(', 0);
591 						break;
592 					case CHR('#'):	/* comment */
593 						while (!ATEOS() && *v->now != CHR(')'))
594 							v->now++;
595 						if (!ATEOS())
596 							v->now++;
597 						assert(v->nexttype == v->lasttype);
598 						return next(v);
599 						break;
600 					case CHR('='):	/* positive lookahead */
601 						NOTE(REG_ULOOKAROUND);
602 						RETV(LACON, LATYPE_AHEAD_POS);
603 						break;
604 					case CHR('!'):	/* negative lookahead */
605 						NOTE(REG_ULOOKAROUND);
606 						RETV(LACON, LATYPE_AHEAD_NEG);
607 						break;
608 					case CHR('<'):
609 						if (ATEOS())
610 							FAILW(REG_BADRPT);
611 						switch (*v->now++)
612 						{
613 							case CHR('='):	/* positive lookbehind */
614 								NOTE(REG_ULOOKAROUND);
615 								RETV(LACON, LATYPE_BEHIND_POS);
616 								break;
617 							case CHR('!'):	/* negative lookbehind */
618 								NOTE(REG_ULOOKAROUND);
619 								RETV(LACON, LATYPE_BEHIND_NEG);
620 								break;
621 							default:
622 								FAILW(REG_BADRPT);
623 								break;
624 						}
625 						assert(NOTREACHED);
626 						break;
627 					default:
628 						FAILW(REG_BADRPT);
629 						break;
630 				}
631 				assert(NOTREACHED);
632 			}
633 			if (v->cflags & REG_NOSUB)
634 				RETV('(', 0);	/* all parens non-capturing */
635 			else
636 				RETV('(', 1);
637 			break;
638 		case CHR(')'):
639 			if (LASTTYPE('('))
640 				NOTE(REG_UUNSPEC);
641 			RETV(')', c);
642 			break;
643 		case CHR('['):			/* easy except for [[:<:]] and [[:>:]] */
644 			if (HAVE(6) && *(v->now + 0) == CHR('[') &&
645 				*(v->now + 1) == CHR(':') &&
646 				(*(v->now + 2) == CHR('<') ||
647 				 *(v->now + 2) == CHR('>')) &&
648 				*(v->now + 3) == CHR(':') &&
649 				*(v->now + 4) == CHR(']') &&
650 				*(v->now + 5) == CHR(']'))
651 			{
652 				c = *(v->now + 2);
653 				v->now += 6;
654 				NOTE(REG_UNONPOSIX);
655 				RET((c == CHR('<')) ? '<' : '>');
656 			}
657 			INTOCON(L_BRACK);
658 			if (NEXT1('^'))
659 			{
660 				v->now++;
661 				RETV('[', 0);
662 			}
663 			RETV('[', 1);
664 			break;
665 		case CHR('.'):
666 			RET('.');
667 			break;
668 		case CHR('^'):
669 			RET('^');
670 			break;
671 		case CHR('$'):
672 			RET('$');
673 			break;
674 		case CHR('\\'):			/* mostly punt backslashes to code below */
675 			if (ATEOS())
676 				FAILW(REG_EESCAPE);
677 			break;
678 		default:				/* ordinary character */
679 			RETV(PLAIN, c);
680 			break;
681 	}
682 
683 	/* ERE/ARE backslash handling; backslash already eaten */
684 	assert(!ATEOS());
685 	if (!(v->cflags & REG_ADVF))
686 	{							/* only AREs have non-trivial escapes */
687 		if (iscalnum(*v->now))
688 		{
689 			NOTE(REG_UBSALNUM);
690 			NOTE(REG_UUNSPEC);
691 		}
692 		RETV(PLAIN, *v->now++);
693 	}
694 	(DISCARD) lexescape(v);
695 	if (ISERR())
696 		FAILW(REG_EESCAPE);
697 	if (v->nexttype == CCLASS)
698 	{							/* fudge at lexical level */
699 		switch (v->nextvalue)
700 		{
701 			case 'd':
702 				lexnest(v, backd, ENDOF(backd));
703 				break;
704 			case 'D':
705 				lexnest(v, backD, ENDOF(backD));
706 				break;
707 			case 's':
708 				lexnest(v, backs, ENDOF(backs));
709 				break;
710 			case 'S':
711 				lexnest(v, backS, ENDOF(backS));
712 				break;
713 			case 'w':
714 				lexnest(v, backw, ENDOF(backw));
715 				break;
716 			case 'W':
717 				lexnest(v, backW, ENDOF(backW));
718 				break;
719 			default:
720 				assert(NOTREACHED);
721 				FAILW(REG_ASSERT);
722 				break;
723 		}
724 		/* lexnest done, back up and try again */
725 		v->nexttype = v->lasttype;
726 		return next(v);
727 	}
728 	/* otherwise, lexescape has already done the work */
729 	return !ISERR();
730 }
731 
732 /*
733  * lexescape - parse an ARE backslash escape (backslash already eaten)
734  * Note slightly nonstandard use of the CCLASS type code.
735  */
736 static int						/* not actually used, but convenient for RETV */
lexescape(struct vars * v)737 lexescape(struct vars *v)
738 {
739 	chr			c;
740 	static const chr alert[] = {
741 		CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
742 	};
743 	static const chr esc[] = {
744 		CHR('E'), CHR('S'), CHR('C')
745 	};
746 	const chr  *save;
747 
748 	assert(v->cflags & REG_ADVF);
749 
750 	assert(!ATEOS());
751 	c = *v->now++;
752 	if (!iscalnum(c))
753 		RETV(PLAIN, c);
754 
755 	NOTE(REG_UNONPOSIX);
756 	switch (c)
757 	{
758 		case CHR('a'):
759 			RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
760 			break;
761 		case CHR('A'):
762 			RETV(SBEGIN, 0);
763 			break;
764 		case CHR('b'):
765 			RETV(PLAIN, CHR('\b'));
766 			break;
767 		case CHR('B'):
768 			RETV(PLAIN, CHR('\\'));
769 			break;
770 		case CHR('c'):
771 			NOTE(REG_UUNPORT);
772 			if (ATEOS())
773 				FAILW(REG_EESCAPE);
774 			RETV(PLAIN, (chr) (*v->now++ & 037));
775 			break;
776 		case CHR('d'):
777 			NOTE(REG_ULOCALE);
778 			RETV(CCLASS, 'd');
779 			break;
780 		case CHR('D'):
781 			NOTE(REG_ULOCALE);
782 			RETV(CCLASS, 'D');
783 			break;
784 		case CHR('e'):
785 			NOTE(REG_UUNPORT);
786 			RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
787 			break;
788 		case CHR('f'):
789 			RETV(PLAIN, CHR('\f'));
790 			break;
791 		case CHR('m'):
792 			RET('<');
793 			break;
794 		case CHR('M'):
795 			RET('>');
796 			break;
797 		case CHR('n'):
798 			RETV(PLAIN, CHR('\n'));
799 			break;
800 		case CHR('r'):
801 			RETV(PLAIN, CHR('\r'));
802 			break;
803 		case CHR('s'):
804 			NOTE(REG_ULOCALE);
805 			RETV(CCLASS, 's');
806 			break;
807 		case CHR('S'):
808 			NOTE(REG_ULOCALE);
809 			RETV(CCLASS, 'S');
810 			break;
811 		case CHR('t'):
812 			RETV(PLAIN, CHR('\t'));
813 			break;
814 		case CHR('u'):
815 			c = lexdigits(v, 16, 4, 4);
816 			if (ISERR() || !CHR_IS_IN_RANGE(c))
817 				FAILW(REG_EESCAPE);
818 			RETV(PLAIN, c);
819 			break;
820 		case CHR('U'):
821 			c = lexdigits(v, 16, 8, 8);
822 			if (ISERR() || !CHR_IS_IN_RANGE(c))
823 				FAILW(REG_EESCAPE);
824 			RETV(PLAIN, c);
825 			break;
826 		case CHR('v'):
827 			RETV(PLAIN, CHR('\v'));
828 			break;
829 		case CHR('w'):
830 			NOTE(REG_ULOCALE);
831 			RETV(CCLASS, 'w');
832 			break;
833 		case CHR('W'):
834 			NOTE(REG_ULOCALE);
835 			RETV(CCLASS, 'W');
836 			break;
837 		case CHR('x'):
838 			NOTE(REG_UUNPORT);
839 			c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
840 			if (ISERR() || !CHR_IS_IN_RANGE(c))
841 				FAILW(REG_EESCAPE);
842 			RETV(PLAIN, c);
843 			break;
844 		case CHR('y'):
845 			NOTE(REG_ULOCALE);
846 			RETV(WBDRY, 0);
847 			break;
848 		case CHR('Y'):
849 			NOTE(REG_ULOCALE);
850 			RETV(NWBDRY, 0);
851 			break;
852 		case CHR('Z'):
853 			RETV(SEND, 0);
854 			break;
855 		case CHR('1'):
856 		case CHR('2'):
857 		case CHR('3'):
858 		case CHR('4'):
859 		case CHR('5'):
860 		case CHR('6'):
861 		case CHR('7'):
862 		case CHR('8'):
863 		case CHR('9'):
864 			save = v->now;
865 			v->now--;			/* put first digit back */
866 			c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
867 			if (ISERR())
868 				FAILW(REG_EESCAPE);
869 			/* ugly heuristic (first test is "exactly 1 digit?") */
870 			if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp))
871 			{
872 				NOTE(REG_UBACKREF);
873 				RETV(BACKREF, c);
874 			}
875 			/* oops, doesn't look like it's a backref after all... */
876 			v->now = save;
877 			/* and fall through into octal number */
878 			/* FALLTHROUGH */
879 		case CHR('0'):
880 			NOTE(REG_UUNPORT);
881 			v->now--;			/* put first digit back */
882 			c = lexdigits(v, 8, 1, 3);
883 			if (ISERR())
884 				FAILW(REG_EESCAPE);
885 			if (c > 0xff)
886 			{
887 				/* out of range, so we handled one digit too much */
888 				v->now--;
889 				c >>= 3;
890 			}
891 			RETV(PLAIN, c);
892 			break;
893 		default:
894 			assert(iscalpha(c));
895 			FAILW(REG_EESCAPE); /* unknown alphabetic escape */
896 			break;
897 	}
898 	assert(NOTREACHED);
899 }
900 
901 /*
902  * lexdigits - slurp up digits and return chr value
903  *
904  * This does not account for overflow; callers should range-check the result
905  * if maxlen is large enough to make that possible.
906  */
907 static chr						/* chr value; errors signalled via ERR */
lexdigits(struct vars * v,int base,int minlen,int maxlen)908 lexdigits(struct vars *v,
909 		  int base,
910 		  int minlen,
911 		  int maxlen)
912 {
913 	uchr		n;				/* unsigned to avoid overflow misbehavior */
914 	int			len;
915 	chr			c;
916 	int			d;
917 	const uchr	ub = (uchr) base;
918 
919 	n = 0;
920 	for (len = 0; len < maxlen && !ATEOS(); len++)
921 	{
922 		c = *v->now++;
923 		switch (c)
924 		{
925 			case CHR('0'):
926 			case CHR('1'):
927 			case CHR('2'):
928 			case CHR('3'):
929 			case CHR('4'):
930 			case CHR('5'):
931 			case CHR('6'):
932 			case CHR('7'):
933 			case CHR('8'):
934 			case CHR('9'):
935 				d = DIGITVAL(c);
936 				break;
937 			case CHR('a'):
938 			case CHR('A'):
939 				d = 10;
940 				break;
941 			case CHR('b'):
942 			case CHR('B'):
943 				d = 11;
944 				break;
945 			case CHR('c'):
946 			case CHR('C'):
947 				d = 12;
948 				break;
949 			case CHR('d'):
950 			case CHR('D'):
951 				d = 13;
952 				break;
953 			case CHR('e'):
954 			case CHR('E'):
955 				d = 14;
956 				break;
957 			case CHR('f'):
958 			case CHR('F'):
959 				d = 15;
960 				break;
961 			default:
962 				v->now--;		/* oops, not a digit at all */
963 				d = -1;
964 				break;
965 		}
966 
967 		if (d >= base)
968 		{						/* not a plausible digit */
969 			v->now--;
970 			d = -1;
971 		}
972 		if (d < 0)
973 			break;				/* NOTE BREAK OUT */
974 		n = n * ub + (uchr) d;
975 	}
976 	if (len < minlen)
977 		ERR(REG_EESCAPE);
978 
979 	return (chr) n;
980 }
981 
982 /*
983  * brenext - get next BRE token
984  *
985  * This is much like EREs except for all the stupid backslashes and the
986  * context-dependency of some things.
987  */
988 static int						/* 1 normal, 0 failure */
brenext(struct vars * v,chr c)989 brenext(struct vars *v,
990 		chr c)
991 {
992 	switch (c)
993 	{
994 		case CHR('*'):
995 			if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
996 				RETV(PLAIN, c);
997 			RETV('*', 1);
998 			break;
999 		case CHR('['):
1000 			if (HAVE(6) && *(v->now + 0) == CHR('[') &&
1001 				*(v->now + 1) == CHR(':') &&
1002 				(*(v->now + 2) == CHR('<') ||
1003 				 *(v->now + 2) == CHR('>')) &&
1004 				*(v->now + 3) == CHR(':') &&
1005 				*(v->now + 4) == CHR(']') &&
1006 				*(v->now + 5) == CHR(']'))
1007 			{
1008 				c = *(v->now + 2);
1009 				v->now += 6;
1010 				NOTE(REG_UNONPOSIX);
1011 				RET((c == CHR('<')) ? '<' : '>');
1012 			}
1013 			INTOCON(L_BRACK);
1014 			if (NEXT1('^'))
1015 			{
1016 				v->now++;
1017 				RETV('[', 0);
1018 			}
1019 			RETV('[', 1);
1020 			break;
1021 		case CHR('.'):
1022 			RET('.');
1023 			break;
1024 		case CHR('^'):
1025 			if (LASTTYPE(EMPTY))
1026 				RET('^');
1027 			if (LASTTYPE('('))
1028 			{
1029 				NOTE(REG_UUNSPEC);
1030 				RET('^');
1031 			}
1032 			RETV(PLAIN, c);
1033 			break;
1034 		case CHR('$'):
1035 			if (v->cflags & REG_EXPANDED)
1036 				skip(v);
1037 			if (ATEOS())
1038 				RET('$');
1039 			if (NEXT2('\\', ')'))
1040 			{
1041 				NOTE(REG_UUNSPEC);
1042 				RET('$');
1043 			}
1044 			RETV(PLAIN, c);
1045 			break;
1046 		case CHR('\\'):
1047 			break;				/* see below */
1048 		default:
1049 			RETV(PLAIN, c);
1050 			break;
1051 	}
1052 
1053 	assert(c == CHR('\\'));
1054 
1055 	if (ATEOS())
1056 		FAILW(REG_EESCAPE);
1057 
1058 	c = *v->now++;
1059 	switch (c)
1060 	{
1061 		case CHR('{'):
1062 			INTOCON(L_BBND);
1063 			NOTE(REG_UBOUNDS);
1064 			RET('{');
1065 			break;
1066 		case CHR('('):
1067 			RETV('(', 1);
1068 			break;
1069 		case CHR(')'):
1070 			RETV(')', c);
1071 			break;
1072 		case CHR('<'):
1073 			NOTE(REG_UNONPOSIX);
1074 			RET('<');
1075 			break;
1076 		case CHR('>'):
1077 			NOTE(REG_UNONPOSIX);
1078 			RET('>');
1079 			break;
1080 		case CHR('1'):
1081 		case CHR('2'):
1082 		case CHR('3'):
1083 		case CHR('4'):
1084 		case CHR('5'):
1085 		case CHR('6'):
1086 		case CHR('7'):
1087 		case CHR('8'):
1088 		case CHR('9'):
1089 			NOTE(REG_UBACKREF);
1090 			RETV(BACKREF, (chr) DIGITVAL(c));
1091 			break;
1092 		default:
1093 			if (iscalnum(c))
1094 			{
1095 				NOTE(REG_UBSALNUM);
1096 				NOTE(REG_UUNSPEC);
1097 			}
1098 			RETV(PLAIN, c);
1099 			break;
1100 	}
1101 
1102 	assert(NOTREACHED);
1103 	return 0;
1104 }
1105 
1106 /*
1107  * skip - skip white space and comments in expanded form
1108  */
1109 static void
skip(struct vars * v)1110 skip(struct vars *v)
1111 {
1112 	const chr  *start = v->now;
1113 
1114 	assert(v->cflags & REG_EXPANDED);
1115 
1116 	for (;;)
1117 	{
1118 		while (!ATEOS() && iscspace(*v->now))
1119 			v->now++;
1120 		if (ATEOS() || *v->now != CHR('#'))
1121 			break;				/* NOTE BREAK OUT */
1122 		assert(NEXT1('#'));
1123 		while (!ATEOS() && *v->now != CHR('\n'))
1124 			v->now++;
1125 		/* leave the newline to be picked up by the iscspace loop */
1126 	}
1127 
1128 	if (v->now != start)
1129 		NOTE(REG_UNONPOSIX);
1130 }
1131 
1132 /*
1133  * newline - return the chr for a newline
1134  *
1135  * This helps confine use of CHR to this source file.
1136  */
1137 static chr
newline(void)1138 newline(void)
1139 {
1140 	return CHR('\n');
1141 }
1142 
1143 /*
1144  * chrnamed - return the chr known by a given (chr string) name
1145  *
1146  * The code is a bit clumsy, but this routine gets only such specialized
1147  * use that it hardly matters.
1148  */
1149 static chr
chrnamed(struct vars * v,const chr * startp,const chr * endp,chr lastresort)1150 chrnamed(struct vars *v,
1151 		 const chr *startp,		/* start of name */
1152 		 const chr *endp,		/* just past end of name */
1153 		 chr lastresort)		/* what to return if name lookup fails */
1154 {
1155 	chr			c;
1156 	int			errsave;
1157 	int			e;
1158 	struct cvec *cv;
1159 
1160 	errsave = v->err;
1161 	v->err = 0;
1162 	c = element(v, startp, endp);
1163 	e = v->err;
1164 	v->err = errsave;
1165 
1166 	if (e != 0)
1167 		return lastresort;
1168 
1169 	cv = range(v, c, c, 0);
1170 	if (cv->nchrs == 0)
1171 		return lastresort;
1172 	return cv->chrs[0];
1173 }
1174