1 #include <sys/types.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <limits.h>
6 #include <ctype.h>
7 #include <assert.h>
8 
9 #include "regex.h"
10 
11 /* utility definitions */
12 #ifdef _POSIX2_RE_DUP_MAX
13 #define	DUPMAX	_POSIX2_RE_DUP_MAX
14 #else
15 #define	DUPMAX	255
16 #endif
17 #define	INFINITY	(DUPMAX + 1)
18 #define	NC		(CHAR_MAX - CHAR_MIN + 1)
19 typedef unsigned char uch;
20 
21 /* for old systems with bcopy() but no memmove() */
22 #ifdef USEBCOPY
23 #define	memmove(d, s, c)	bcopy(s, d, c)
24 #endif
25 
26 #define	MAGIC1	((('r'^0200)<<8) | 'e')
27 
28 /*
29  * The internal representation is a *strip*, a sequence of
30  * operators ending with an endmarker.  (Some terminology etc. is a
31  * historical relic of earlier versions which used multiple strips.)
32  * Certain oddities in the representation are there to permit running
33  * the machinery backwards; in particular, any deviation from sequential
34  * flow must be marked at both its source and its destination.  Some
35  * fine points:
36  *
37  * - OPLUS_ and O_PLUS are *inside* the loop they create.
38  * - OQUEST_ and O_QUEST are *outside* the bypass they create.
39  * - OCH_ and O_CH are *outside* the multi-way branch they create, while
40  *   OOR1 and OOR2 are respectively the end and the beginning of one of
41  *   the branches.  Note that there is an implicit OOR2 following OCH_
42  *   and an implicit OOR1 preceding O_CH.
43  *
44  * In state representations, an operator's bit is on to signify a state
45  * immediately *preceding* "execution" of that operator.
46  */
47 typedef long sop;		/* strip operator */
48 typedef long sopno;
49 #define	OPRMASK	0x7c000000
50 #define	OPDMASK	0x03ffffff
51 #define	OPSHIFT	(26)
52 #define	OP(n)	((n)&OPRMASK)
53 #define	OPND(n)	((n)&OPDMASK)
54 #define	SOP(op, opnd)	((op)|(opnd))
55 /* operators			   meaning	operand			*/
56 /*						(back, fwd are offsets)	*/
57 #define	OEND	(1<<OPSHIFT)	/* endmarker	-			*/
58 #define	OCHAR	(2<<OPSHIFT)	/* character	unsigned char		*/
59 #define	OBOL	(3<<OPSHIFT)	/* left anchor	-			*/
60 #define	OEOL	(4<<OPSHIFT)	/* right anchor	-			*/
61 #define	OANY	(5<<OPSHIFT)	/* .		-			*/
62 #define	OANYOF	(6<<OPSHIFT)	/* [...]	set number		*/
63 #define	OBACK_	(7<<OPSHIFT)	/* begin \d	paren number		*/
64 #define	O_BACK	(8<<OPSHIFT)	/* end \d	paren number		*/
65 #define	OPLUS_	(9<<OPSHIFT)	/* + prefix	fwd to suffix		*/
66 #define	O_PLUS	(10<<OPSHIFT)	/* + suffix	back to prefix		*/
67 #define	OQUEST_	(11<<OPSHIFT)	/* ? prefix	fwd to suffix		*/
68 #define	O_QUEST	(12<<OPSHIFT)	/* ? suffix	back to prefix		*/
69 #define	OLPAREN	(13<<OPSHIFT)	/* (		fwd to )		*/
70 #define	ORPAREN	(14<<OPSHIFT)	/* )		back to (		*/
71 #define	OCH_	(15<<OPSHIFT)	/* begin choice	fwd to OOR2		*/
72 #define	OOR1	(16<<OPSHIFT)	/* | pt. 1	back to OOR1 or OCH_	*/
73 #define	OOR2	(17<<OPSHIFT)	/* | pt. 2	fwd to OOR2 or O_CH	*/
74 #define	O_CH	(18<<OPSHIFT)	/* end choice	back to OOR1		*/
75 #define	OBOW	(19<<OPSHIFT)	/* begin word	-			*/
76 #define	OEOW	(20<<OPSHIFT)	/* end word	-			*/
77 
78 /*
79  * Structure for [] character-set representation.  Character sets are
80  * done as bit vectors, grouped 8 to a byte vector for compactness.
81  * The individual set therefore has both a pointer to the byte vector
82  * and a mask to pick out the relevant bit of each byte.  A hash code
83  * simplifies testing whether two sets could be identical.
84  *
85  * This will get trickier for multicharacter collating elements.  As
86  * preliminary hooks for dealing with such things, we also carry along
87  * a string of multi-character elements, and decide the size of the
88  * vectors at run time.
89  */
90 typedef struct {
91 	uch *ptr;		/* -> uch [csetsize] */
92 	uch mask;		/* bit within array */
93 	uch hash;		/* hash code */
94 	size_t smultis;
95 	char *multis;		/* -> char[smulti]  ab\0cd\0ef\0\0 */
96 } cset;
97 /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
98 #define	CHadd(cs, c)	((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
99 #define	CHsub(cs, c)	((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
100 #define	CHIN(cs, c)	((cs)->ptr[(uch)(c)] & (cs)->mask)
101 #define	MCadd(p, cs, cp)	mcadd(p, cs, cp)	/* regcomp() internal fns */
102 
103 /* stuff for character categories */
104 typedef unsigned char cat_t;
105 
106 /*
107  * main compiled-expression structure
108  */
109 struct re_guts {
110 	int magic;
111 #		define	MAGIC2	((('R'^0200)<<8)|'E')
112 	sop *strip;		/* malloced area for strip */
113 	int csetsize;		/* number of bits in a cset vector */
114 	int ncsets;		/* number of csets in use */
115 	cset *sets;		/* -> cset [ncsets] */
116 	uch *setbits;		/* -> uch[csetsize][ncsets/CHAR_BIT] */
117 	int cflags;		/* copy of regcomp() cflags argument */
118 	sopno nstates;		/* = number of sops */
119 	sopno firststate;	/* the initial OEND (normally 0) */
120 	sopno laststate;	/* the final OEND */
121 	int iflags;		/* internal flags */
122 #		define	USEBOL	01	/* used ^ */
123 #		define	USEEOL	02	/* used $ */
124 #		define	BAD	04	/* something wrong */
125 	int nbol;		/* number of ^ used */
126 	int neol;		/* number of $ used */
127 	int ncategories;	/* how many character categories */
128 	cat_t *categories;	/* ->catspace[-CHAR_MIN] */
129 	char *must;		/* match must contain this string */
130 	int mlen;		/* length of must */
131 	size_t nsub;		/* copy of re_nsub */
132 	int backrefs;		/* does it use back references? */
133 	sopno nplus;		/* how deep does it nest +s? */
134 	/* catspace must be last */
135 	cat_t catspace[1];	/* actually [NC] */
136 };
137 
138 /* misc utilities */
139 #define	OUT	(CHAR_MAX+1)	/* a non-character value */
140 #define	ISWORD(c)	(isalnum(c) || (c) == '_')
141 
142 
143 /* character-class table */
144 static struct cclass {
145 	char *name;
146 	char *chars;
147 	char *multis;
148 } cclasses[] = {
149 	{"alnum","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",""},
150 	{"alpha",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",	""},
151 	{"blank"," \t",	""},
152 	{"cntrl","\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177",""},
153 	{"digit","0123456789",""},
154 	{"graph","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",""},
155 	{"lower","abcdefghijklmnopqrstuvwxyz",""},
156 	{"print","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",""},
157 	{"punct","!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",""},
158 	{"space","\t\n\v\f\r ",	""},
159 	{"upper","ABCDEFGHIJKLMNOPQRSTUVWXYZ",""},
160 	{"xdigit","0123456789ABCDEFabcdef",""},
161 	{NULL,NULL,""}
162 };
163 
164 /* character-name table */
165 static struct cname {
166 	char *name;
167 	char code;
168 } cnames[] = {
169 	{"NUL",	'\0'},
170 	{"SOH",	'\001'},
171 	{"STX",	'\002'},
172 	{"ETX",	'\003'},
173 	{"EOT",	'\004'},
174 	{"ENQ",	'\005'},
175 	{"ACK",	'\006'},
176 	{"BEL",	'\007'},
177 	{"alert",	'\007'},
178 	{"BS",		'\010'},
179 	{"backspace",	'\b'},
180 	{"HT",		'\011'},
181 	{"tab",		'\t'},
182 	{"LF",		'\012'},
183 	{"newline",	'\n'},
184 	{"VT",		'\013'},
185 	{"vertical-tab",	'\v'},
186 	{"FF",		'\014'},
187 	{"form-feed",	'\f'},
188 	{"CR",		'\015'},
189 	{"carriage-return",	'\r'},
190 	{"SO",	'\016'},
191 	{"SI",	'\017'},
192 	{"DLE",	'\020'},
193 	{"DC1",	'\021'},
194 	{"DC2",	'\022'},
195 	{"DC3",	'\023'},
196 	{"DC4",	'\024'},
197 	{"NAK",	'\025'},
198 	{"SYN",	'\026'},
199 	{"ETB",	'\027'},
200 	{"CAN",	'\030'},
201 	{"EM",	'\031'},
202 	{"SUB",	'\032'},
203 	{"ESC",	'\033'},
204 	{"IS4",	'\034'},
205 	{"FS",	'\034'},
206 	{"IS3",	'\035'},
207 	{"GS",	'\035'},
208 	{"IS2",	'\036'},
209 	{"RS",	'\036'},
210 	{"IS1",	'\037'},
211 	{"US",	'\037'},
212 	{"space",		' '},
213 	{"exclamation-mark",	'!'},
214 	{"quotation-mark",	'"'},
215 	{"number-sign",		'#'},
216 	{"dollar-sign",		'$'},
217 	{"percent-sign",		'%'},
218 	{"ampersand",		'&'},
219 	{"apostrophe",		'\''},
220 	{"left-parenthesis",	'('},
221 	{"right-parenthesis",	')'},
222 	{"asterisk",	'*'},
223 	{"plus-sign",	'+'},
224 	{"comma",	','},
225 	{"hyphen",	'-'},
226 	{"hyphen-minus",	'-'},
227 	{"period",	'.'},
228 	{"full-stop",	'.'},
229 	{"slash",	'/'},
230 	{"solidus",	'/'},
231 	{"zero",		'0'},
232 	{"one",		'1'},
233 	{"two",		'2'},
234 	{"three",	'3'},
235 	{"four",		'4'},
236 	{"five",		'5'},
237 	{"six",		'6'},
238 	{"seven",	'7'},
239 	{"eight",	'8'},
240 	{"nine",		'9'},
241 	{"colon",	':'},
242 	{"semicolon",	';'},
243 	{"less-than-sign",	'<'},
244 	{"equals-sign",		'='},
245 	{"greater-than-sign",	'>'},
246 	{"question-mark",	'?'},
247 	{"commercial-at",	'@'},
248 	{"left-square-bracket",	'['},
249 	{"backslash",		'\\'},
250 	{"reverse-solidus",	'\\'},
251 	{"right-square-bracket",	']'},
252 	{"circumflex",		'^'},
253 	{"circumflex-accent",	'^'},
254 	{"underscore",		'_'},
255 	{"low-line",		'_'},
256 	{"grave-accent",		'`'},
257 	{"left-brace",		'{'},
258 	{"left-curly-bracket",	'{'},
259 	{"vertical-line",	'|'},
260 	{"right-brace",		'}'},
261 	{"right-curly-bracket",	'}'},
262 	{"tilde",		'~'},
263 	{"DEL",	'\177'},
264 	{NULL,	0}
265 };
266 
267 
268 /*
269  * parse structure, passed up and down to avoid global variables and
270  * other clumsinesses
271  */
272 struct parse {
273 	char *next;		/* next character in RE */
274 	char *end;		/* end of string (-> NUL normally) */
275 	int error;		/* has an error been seen? */
276 	sop *strip;		/* malloced strip */
277 	sopno ssize;		/* malloced strip size (allocated) */
278 	sopno slen;		/* malloced strip length (used) */
279 	int ncsalloc;		/* number of csets allocated */
280 	struct re_guts *g;
281 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
282 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
283 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
284 };
285 
286 #ifdef __cplusplus
287 extern "C" {
288 #endif
289 
290 static void p_ere(register struct parse *p, int stop);
291 static void p_ere_exp(register struct parse *p);
292 static void p_str(register struct parse *p);
293 static void p_bre(register struct parse *p, register int end1, register int end2);
294 static int p_simp_re(register struct parse *p, int starordinary);
295 static int p_count(register struct parse *p);
296 static void p_bracket(register struct parse *p);
297 static void p_b_term(register struct parse *p, register cset *cs);
298 static void p_b_cclass(register struct parse *p, register cset *cs);
299 static void p_b_eclass(register struct parse *p, register cset *cs);
300 static char p_b_symbol(register struct parse *p);
301 static char p_b_coll_elem(register struct parse *p, int endc);
302 static char othercase(int ch);
303 static void bothcases(register struct parse *p, int ch);
304 static void ordinary(register struct parse *p, register int ch);
305 static void nonnewline(register struct parse *p);
306 static void repeat(register struct parse *p, sopno start, int from, int to);
307 static int seterr(register struct parse *p, int e);
308 static cset *allocset(register struct parse *p);
309 static void freeset(register struct parse *p, register cset *cs);
310 static int freezeset(register struct parse *p, register cset *cs);
311 static int firstch(register struct parse *p, register cset *cs);
312 static int nch(register struct parse *p, register cset *cs);
313 static void mcadd(register struct parse *p, register cset *cs, register char *cp);
314 static void mcinvert(register struct parse *p, register cset *cs);
315 static void mccase(register struct parse *p, register cset *cs);
316 static int isinsets(register struct re_guts *g, int c);
317 static int samesets(register struct re_guts *g, int c1, int c2);
318 static void categorize(struct parse *p, register struct re_guts *g);
319 static sopno dupl(register struct parse *p, sopno start, sopno finish);
320 static void doemit(register struct parse *p, sop op, size_t opnd);
321 static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
322 static void dofwd(register struct parse *p, sopno pos, sop value);
323 static void enlarge(register struct parse *p, sopno size);
324 static void stripsnug(register struct parse *p, register struct re_guts *g);
325 static void findmust(register struct parse *p, register struct re_guts *g);
326 static sopno pluscount(register struct parse *p, register struct re_guts *g);
327 
328 #ifdef __cplusplus
329 }
330 #endif
331 
332 static char nuls[10];		/* place to point scanner in event of error */
333 
334 /*
335  * macros for use with parse structure
336  * BEWARE:  these know that the parse structure is named `p' !!!
337  */
338 #define	PEEK()	(*p->next)
339 #define	PEEK2()	(*(p->next+1))
340 #define	MORE()	(p->next < p->end)
341 #define	MORE2()	(p->next+1 < p->end)
342 #define	SEE(c)	(MORE() && PEEK() == (c))
343 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
344 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
345 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
346 #define	NEXT()	(p->next++)
347 #define	NEXT2()	(p->next += 2)
348 #define	NEXTn(n)	(p->next += (n))
349 #define	GETNEXT()	(*p->next++)
350 #define	SETERROR(e)	seterr(p, (e))
351 #define	REQUIRE(co, e)	((co) || SETERROR(e))
352 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
353 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
354 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
355 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
356 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
357 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
358 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
359 #define	HERE()		(p->slen)
360 #define	THERE()		(p->slen - 1)
361 #define	THERETHERE()	(p->slen - 2)
362 #define	DROP(n)	(p->slen -= (n))
363 
364 #define	never	0		/* some <assert.h>s have bugs too */
365 
366 int				/* 0 success, otherwise REG_something */
regcomp(preg,pattern,cflags)367 regcomp(preg, pattern, cflags)
368 regex_t *preg;
369 const char *pattern;
370 int cflags;
371 {
372 	struct parse pa;
373 	register struct re_guts *g;
374 	register struct parse *p = &pa;
375 	register int i;
376 	register size_t len;
377 #define	GOODFLAGS(f)	((f)&~REG_DUMP)
378 
379 	cflags = GOODFLAGS(cflags);
380 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
381 		return(REG_INVARG);
382 
383 	if (cflags&REG_PEND) {
384 		if (preg->re_endp < pattern)
385 			return(REG_INVARG);
386 		len = preg->re_endp - pattern;
387 	} else
388 		len = strlen((char *)pattern);
389 
390 	/* do the mallocs early so failure handling is easy */
391 	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
392 							(NC-1)*sizeof(cat_t));
393 	if (g == NULL)
394 		return(REG_ESPACE);
395 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
396 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
397 	p->slen = 0;
398 	if (p->strip == NULL) {
399 		free((char *)g);
400 		return(REG_ESPACE);
401 	}
402 
403 	/* set things up */
404 	p->g = g;
405 	p->next = (char *)pattern;	/* convenience; we do not modify it */
406 	p->end = p->next + len;
407 	p->error = 0;
408 	p->ncsalloc = 0;
409 	for (i = 0; i < NPAREN; i++) {
410 		p->pbegin[i] = 0;
411 		p->pend[i] = 0;
412 	}
413 	g->csetsize = NC;
414 	g->sets = NULL;
415 	g->setbits = NULL;
416 	g->ncsets = 0;
417 	g->cflags = cflags;
418 	g->iflags = 0;
419 	g->nbol = 0;
420 	g->neol = 0;
421 	g->must = NULL;
422 	g->mlen = 0;
423 	g->nsub = 0;
424 	g->ncategories = 1;	/* category 0 is "everything else" */
425 	g->categories = &g->catspace[-(CHAR_MIN)];
426 	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
427 	g->backrefs = 0;
428 
429 	/* do it */
430 	EMIT(OEND, 0);
431 	g->firststate = THERE();
432 	if (cflags&REG_EXTENDED)
433 		p_ere(p, OUT);
434 	else if (cflags&REG_NOSPEC)
435 		p_str(p);
436 	else
437 		p_bre(p, OUT, OUT);
438 	EMIT(OEND, 0);
439 	g->laststate = THERE();
440 
441 	/* tidy up loose ends and fill things in */
442 	categorize(p, g);
443 	stripsnug(p, g);
444 	findmust(p, g);
445 	g->nplus = pluscount(p, g);
446 	g->magic = MAGIC2;
447 	preg->re_nsub = g->nsub;
448 	preg->re_g = g;
449 	preg->re_magic = MAGIC1;
450 	/* not debugging, so can't rely on the assert() in regexec() */
451 	if (g->iflags&BAD)
452 		SETERROR(REG_ASSERT);
453 
454 	/* win or lose, we're done */
455 	if (p->error != 0)	/* lose */
456 		regfree(preg);
457 	return(p->error);
458 #undef GOODFLAGS
459 }
460 
461 /*
462  - p_ere - ERE parser top level, concatenation and alternation
463  == static void p_ere(register struct parse *p, int stop);
464  */
465 static void
p_ere(p,stop)466 p_ere(p, stop)
467 register struct parse *p;
468 int stop;			/* character this ERE should end at */
469 {
470 	register char c;
471 	register sopno prevback;
472 	register sopno prevfwd;
473 	register sopno conc;
474 	register int first = 1;		/* is this the first alternative? */
475 
476 	for (;;) {
477 		/* do a bunch of concatenated expressions */
478 		conc = HERE();
479 		while (MORE() && (c = PEEK()) != '|' && c != stop)
480 			p_ere_exp(p);
481 		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
482 
483 		if (!EAT('|'))
484 			break;		/* NOTE BREAK OUT */
485 
486 		if (first) {
487 			INSERT(OCH_, conc);	/* offset is wrong */
488 			prevfwd = conc;
489 			prevback = conc;
490 			first = 0;
491 		}
492 		ASTERN(OOR1, prevback);
493 		prevback = THERE();
494 		AHEAD(prevfwd);			/* fix previous offset */
495 		prevfwd = HERE();
496 		EMIT(OOR2, 0);			/* offset is very wrong */
497 	}
498 
499 	if (!first) {		/* tail-end fixups */
500 		AHEAD(prevfwd);
501 		ASTERN(O_CH, prevback);
502 	}
503 
504 	assert(!MORE() || SEE(stop));
505 }
506 
507 /*
508  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
509  == static void p_ere_exp(register struct parse *p);
510  */
511 static void
p_ere_exp(p)512 p_ere_exp(p)
513 register struct parse *p;
514 {
515 	register char c;
516 	register sopno pos;
517 	register int count;
518 	register int count2;
519 	register sopno subno;
520 	int wascaret = 0;
521 
522 	assert(MORE());		/* caller should have ensured this */
523 	c = GETNEXT();
524 
525 	pos = HERE();
526 	switch (c) {
527 	case '(':
528 		REQUIRE(MORE(), REG_EPAREN);
529 		p->g->nsub++;
530 		subno = p->g->nsub;
531 		if (subno < NPAREN)
532 			p->pbegin[subno] = HERE();
533 		EMIT(OLPAREN, subno);
534 		if (!SEE(')'))
535 			p_ere(p, ')');
536 		if (subno < NPAREN) {
537 			p->pend[subno] = HERE();
538 			assert(p->pend[subno] != 0);
539 		}
540 		EMIT(ORPAREN, subno);
541 		MUSTEAT(')', REG_EPAREN);
542 		break;
543 	case '^':
544 		EMIT(OBOL, 0);
545 		p->g->iflags |= USEBOL;
546 		p->g->nbol++;
547 		wascaret = 1;
548 		break;
549 	case '$':
550 		EMIT(OEOL, 0);
551 		p->g->iflags |= USEEOL;
552 		p->g->neol++;
553 		break;
554 	case '|':
555 		SETERROR(REG_EMPTY);
556 		break;
557 	case '*':
558 	case '+':
559 	case '?':
560 		SETERROR(REG_BADRPT);
561 		break;
562 	case '.':
563 		if (p->g->cflags&REG_NEWLINE)
564 			nonnewline(p);
565 		else
566 			EMIT(OANY, 0);
567 		break;
568 	case '[':
569 		p_bracket(p);
570 		break;
571 	case '\\':
572 		REQUIRE(MORE(), REG_EESCAPE);
573 		c = GETNEXT();
574 		ordinary(p, c);
575 		break;
576 	case '{':		/* okay as ordinary except if digit follows */
577 		REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
578 		/* FALLTHROUGH */
579 	default:
580 		ordinary(p, c);
581 		break;
582 	}
583 
584 	if (!MORE())
585 		return;
586 	c = PEEK();
587 	/* we call { a repetition if followed by a digit */
588 	if (!( c == '*' || c == '+' || c == '?' ||
589 				(c == '{' && MORE2() && isdigit(PEEK2())) ))
590 		return;		/* no repetition, we're done */
591 	NEXT();
592 
593 	REQUIRE(!wascaret, REG_BADRPT);
594 	switch (c) {
595 	case '*':	/* implemented as +? */
596 		/* this case does not require the (y|) trick, noKLUDGE */
597 		INSERT(OPLUS_, pos);
598 		ASTERN(O_PLUS, pos);
599 		INSERT(OQUEST_, pos);
600 		ASTERN(O_QUEST, pos);
601 		break;
602 	case '+':
603 		INSERT(OPLUS_, pos);
604 		ASTERN(O_PLUS, pos);
605 		break;
606 	case '?':
607 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
608 		INSERT(OCH_, pos);		/* offset slightly wrong */
609 		ASTERN(OOR1, pos);		/* this one's right */
610 		AHEAD(pos);			/* fix the OCH_ */
611 		EMIT(OOR2, 0);			/* offset very wrong... */
612 		AHEAD(THERE());			/* ...so fix it */
613 		ASTERN(O_CH, THERETHERE());
614 		break;
615 	case '{':
616 		count = p_count(p);
617 		if (EAT(',')) {
618 			if (isdigit(PEEK())) {
619 				count2 = p_count(p);
620 				REQUIRE(count <= count2, REG_BADBR);
621 			} else		/* single number with comma */
622 				count2 = INFINITY;
623 		} else		/* just a single number */
624 			count2 = count;
625 		repeat(p, pos, count, count2);
626 		if (!EAT('}')) {	/* error heuristics */
627 			while (MORE() && PEEK() != '}')
628 				NEXT();
629 			REQUIRE(MORE(), REG_EBRACE);
630 			SETERROR(REG_BADBR);
631 		}
632 		break;
633 	}
634 
635 	if (!MORE())
636 		return;
637 	c = PEEK();
638 	if (!( c == '*' || c == '+' || c == '?' ||
639 				(c == '{' && MORE2() && isdigit(PEEK2())) ) )
640 		return;
641 	SETERROR(REG_BADRPT);
642 }
643 
644 /*
645  - p_str - string (no metacharacters) "parser"
646  == static void p_str(register struct parse *p);
647  */
648 static void
p_str(p)649 p_str(p)
650 register struct parse *p;
651 {
652 	REQUIRE(MORE(), REG_EMPTY);
653 	while (MORE())
654 		ordinary(p, GETNEXT());
655 }
656 
657 /*
658  - p_bre - BRE parser top level, anchoring and concatenation
659  == static void p_bre(register struct parse *p, register int end1, \
660  ==	register int end2);
661  * Giving end1 as OUT essentially eliminates the end1/end2 check.
662  *
663  * This implementation is a bit of a kludge, in that a trailing $ is first
664  * taken as an ordinary character and then revised to be an anchor.  The
665  * only undesirable side effect is that '$' gets included as a character
666  * category in such cases.  This is fairly harmless; not worth fixing.
667  * The amount of lookahead needed to avoid this kludge is excessive.
668  */
669 static void
p_bre(p,end1,end2)670 p_bre(p, end1, end2)
671 register struct parse *p;
672 register int end1;		/* first terminating character */
673 register int end2;		/* second terminating character */
674 {
675 	register sopno start = HERE();
676 	register int first = 1;			/* first subexpression? */
677 	register int wasdollar = 0;
678 
679 	if (EAT('^')) {
680 		EMIT(OBOL, 0);
681 		p->g->iflags |= USEBOL;
682 		p->g->nbol++;
683 	}
684 	while (MORE() && !SEETWO(end1, end2)) {
685 		wasdollar = p_simp_re(p, first);
686 		first = 0;
687 	}
688 	if (wasdollar) {	/* oops, that was a trailing anchor */
689 		DROP(1);
690 		EMIT(OEOL, 0);
691 		p->g->iflags |= USEEOL;
692 		p->g->neol++;
693 	}
694 
695 	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
696 }
697 
698 /*
699  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
700  == static int p_simp_re(register struct parse *p, int starordinary);
701  */
702 static int			/* was the simple RE an unbackslashed $? */
p_simp_re(p,starordinary)703 p_simp_re(p, starordinary)
704 register struct parse *p;
705 int starordinary;		/* is a leading * an ordinary character? */
706 {
707 	register int c;
708 	register int count;
709 	register int count2;
710 	register sopno pos;
711 	register int i;
712 	register sopno subno;
713 #	define	BACKSL	(1<<CHAR_BIT)
714 
715 	pos = HERE();		/* repetion op, if any, covers from here */
716 
717 	assert(MORE());		/* caller should have ensured this */
718 	c = GETNEXT();
719 	if (c == '\\') {
720 		REQUIRE(MORE(), REG_EESCAPE);
721 		c = BACKSL | (unsigned char)GETNEXT();
722 	}
723 	switch (c) {
724 	case '.':
725 		if (p->g->cflags&REG_NEWLINE)
726 			nonnewline(p);
727 		else
728 			EMIT(OANY, 0);
729 		break;
730 	case '[':
731 		p_bracket(p);
732 		break;
733 	case BACKSL|'{':
734 		SETERROR(REG_BADRPT);
735 		break;
736 	case BACKSL|'(':
737 		p->g->nsub++;
738 		subno = p->g->nsub;
739 		if (subno < NPAREN)
740 			p->pbegin[subno] = HERE();
741 		EMIT(OLPAREN, subno);
742 		/* the MORE here is an error heuristic */
743 		if (MORE() && !SEETWO('\\', ')'))
744 			p_bre(p, '\\', ')');
745 		if (subno < NPAREN) {
746 			p->pend[subno] = HERE();
747 			assert(p->pend[subno] != 0);
748 		}
749 		EMIT(ORPAREN, subno);
750 		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
751 		break;
752 	case BACKSL|')':	/* should not get here -- must be user */
753 	case BACKSL|'}':
754 		SETERROR(REG_EPAREN);
755 		break;
756 	case BACKSL|'1':
757 	case BACKSL|'2':
758 	case BACKSL|'3':
759 	case BACKSL|'4':
760 	case BACKSL|'5':
761 	case BACKSL|'6':
762 	case BACKSL|'7':
763 	case BACKSL|'8':
764 	case BACKSL|'9':
765 		i = (c&~BACKSL) - '0';
766 		assert(i < NPAREN);
767 		if (p->pend[i] != 0) {
768 			assert(i <= p->g->nsub);
769 			EMIT(OBACK_, i);
770 			assert(p->pbegin[i] != 0);
771 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
772 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
773 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
774 			EMIT(O_BACK, i);
775 		} else
776 			SETERROR(REG_ESUBREG);
777 		p->g->backrefs = 1;
778 		break;
779 	case '*':
780 		REQUIRE(starordinary, REG_BADRPT);
781 		/* FALLTHROUGH */
782 	default:
783 		ordinary(p, (char)c);	/* takes off BACKSL, if any */
784 		break;
785 	}
786 
787 	if (EAT('*')) {		/* implemented as +? */
788 		/* this case does not require the (y|) trick, noKLUDGE */
789 		INSERT(OPLUS_, pos);
790 		ASTERN(O_PLUS, pos);
791 		INSERT(OQUEST_, pos);
792 		ASTERN(O_QUEST, pos);
793 	} else if (EATTWO('\\', '{')) {
794 		count = p_count(p);
795 		if (EAT(',')) {
796 			if (MORE() && isdigit(PEEK())) {
797 				count2 = p_count(p);
798 				REQUIRE(count <= count2, REG_BADBR);
799 			} else		/* single number with comma */
800 				count2 = INFINITY;
801 		} else		/* just a single number */
802 			count2 = count;
803 		repeat(p, pos, count, count2);
804 		if (!EATTWO('\\', '}')) {	/* error heuristics */
805 			while (MORE() && !SEETWO('\\', '}'))
806 				NEXT();
807 			REQUIRE(MORE(), REG_EBRACE);
808 			SETERROR(REG_BADBR);
809 		}
810 	} else if (c == (unsigned char)'$')	/* $ (but not \$) ends it */
811 		return(1);
812 
813 	return(0);
814 }
815 
816 /*
817  - p_count - parse a repetition count
818  == static int p_count(register struct parse *p);
819  */
820 static int			/* the value */
p_count(p)821 p_count(p)
822 register struct parse *p;
823 {
824 	register int count = 0;
825 	register int ndigits = 0;
826 
827 	while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
828 		count = count*10 + (GETNEXT() - '0');
829 		ndigits++;
830 	}
831 
832 	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
833 	return(count);
834 }
835 
836 /*
837  - p_bracket - parse a bracketed character list
838  == static void p_bracket(register struct parse *p);
839  *
840  * Note a significant property of this code:  if the allocset() did SETERROR,
841  * no set operations are done.
842  */
843 static void
p_bracket(p)844 p_bracket(p)
845 register struct parse *p;
846 {
847 	register cset *cs = allocset(p);
848 	register int invert = 0;
849 
850 	/* Dept of Truly Sickening Special-Case Kludges */
851 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
852 		EMIT(OBOW, 0);
853 		NEXTn(6);
854 		return;
855 	}
856 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
857 		EMIT(OEOW, 0);
858 		NEXTn(6);
859 		return;
860 	}
861 
862 	if (EAT('^'))
863 		invert++;	/* make note to invert set at end */
864 	if (EAT(']'))
865 		CHadd(cs, ']');
866 	else if (EAT('-'))
867 		CHadd(cs, '-');
868 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
869 		p_b_term(p, cs);
870 	if (EAT('-'))
871 		CHadd(cs, '-');
872 	MUSTEAT(']', REG_EBRACK);
873 
874 	if (p->error != 0)	/* don't mess things up further */
875 		return;
876 
877 	if (p->g->cflags&REG_ICASE) {
878 		register int i;
879 		register int ci;
880 
881 		for (i = p->g->csetsize - 1; i >= 0; i--)
882 			if (CHIN(cs, i) && isalpha(i)) {
883 				ci = othercase(i);
884 				if (ci != i)
885 					CHadd(cs, ci);
886 			}
887 		if (cs->multis != NULL)
888 			mccase(p, cs);
889 	}
890 	if (invert) {
891 		register int i;
892 
893 		for (i = p->g->csetsize - 1; i >= 0; i--)
894 			if (CHIN(cs, i))
895 				CHsub(cs, i);
896 			else
897 				CHadd(cs, i);
898 		if (p->g->cflags&REG_NEWLINE)
899 			CHsub(cs, '\n');
900 		if (cs->multis != NULL)
901 			mcinvert(p, cs);
902 	}
903 
904 	assert(cs->multis == NULL);		/* xxx */
905 
906 	if (nch(p, cs) == 1) {		/* optimize singleton sets */
907 		ordinary(p, firstch(p, cs));
908 		freeset(p, cs);
909 	} else
910 		EMIT(OANYOF, freezeset(p, cs));
911 }
912 
913 /*
914  - p_b_term - parse one term of a bracketed character list
915  == static void p_b_term(register struct parse *p, register cset *cs);
916  */
917 static void
p_b_term(p,cs)918 p_b_term(p, cs)
919 register struct parse *p;
920 register cset *cs;
921 {
922 	register char c;
923 	register char start, finish;
924 	register int i;
925 
926 	/* classify what we've got */
927 	switch ((MORE()) ? PEEK() : '\0') {
928 	case '[':
929 		c = (MORE2()) ? PEEK2() : '\0';
930 		break;
931 	case '-':
932 		SETERROR(REG_ERANGE);
933 		return;			/* NOTE RETURN */
934 		break;
935 	default:
936 		c = '\0';
937 		break;
938 	}
939 
940 	switch (c) {
941 	case ':':		/* character class */
942 		NEXT2();
943 		REQUIRE(MORE(), REG_EBRACK);
944 		c = PEEK();
945 		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
946 		p_b_cclass(p, cs);
947 		REQUIRE(MORE(), REG_EBRACK);
948 		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
949 		break;
950 	case '=':		/* equivalence class */
951 		NEXT2();
952 		REQUIRE(MORE(), REG_EBRACK);
953 		c = PEEK();
954 		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
955 		p_b_eclass(p, cs);
956 		REQUIRE(MORE(), REG_EBRACK);
957 		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
958 		break;
959 	default:		/* symbol, ordinary character, or range */
960 /* xxx revision needed for multichar stuff */
961 		start = p_b_symbol(p);
962 		if (SEE('-') && MORE2() && PEEK2() != ']') {
963 			/* range */
964 			NEXT();
965 			if (EAT('-'))
966 				finish = '-';
967 			else
968 				finish = p_b_symbol(p);
969 		} else
970 			finish = start;
971 /* xxx what about signed chars here... */
972 		REQUIRE(start <= finish, REG_ERANGE);
973 		for (i = start; i <= finish; i++)
974 			CHadd(cs, i);
975 		break;
976 	}
977 }
978 
979 /*
980  - p_b_cclass - parse a character-class name and deal with it
981  == static void p_b_cclass(register struct parse *p, register cset *cs);
982  */
983 static void
p_b_cclass(p,cs)984 p_b_cclass(p, cs)
985 register struct parse *p;
986 register cset *cs;
987 {
988 	register char *sp = p->next;
989 	register struct cclass *cp;
990 	register size_t len;
991 	register char *u;
992 	register char c;
993 
994 	while (MORE() && isalpha(PEEK()))
995 		NEXT();
996 	len = p->next - sp;
997 	for (cp = cclasses; cp->name != NULL; cp++)
998 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
999 			break;
1000 	if (cp->name == NULL) {
1001 		/* oops, didn't find it */
1002 		SETERROR(REG_ECTYPE);
1003 		return;
1004 	}
1005 
1006 	u = cp->chars;
1007 	while ((c = *u++) != '\0')
1008 		CHadd(cs, c);
1009 	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
1010 		MCadd(p, cs, u);
1011 }
1012 
1013 /*
1014  - p_b_eclass - parse an equivalence-class name and deal with it
1015  == static void p_b_eclass(register struct parse *p, register cset *cs);
1016  *
1017  * This implementation is incomplete. xxx
1018  */
1019 static void
p_b_eclass(p,cs)1020 p_b_eclass(p, cs)
1021 register struct parse *p;
1022 register cset *cs;
1023 {
1024 	register char c;
1025 
1026 	c = p_b_coll_elem(p, '=');
1027 	CHadd(cs, c);
1028 }
1029 
1030 /*
1031  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1032  == static char p_b_symbol(register struct parse *p);
1033  */
1034 static char			/* value of symbol */
p_b_symbol(p)1035 p_b_symbol(p)
1036 register struct parse *p;
1037 {
1038 	register char value;
1039 
1040 	REQUIRE(MORE(), REG_EBRACK);
1041 	if (!EATTWO('[', '.'))
1042 		return(GETNEXT());
1043 
1044 	/* collating symbol */
1045 	value = p_b_coll_elem(p, '.');
1046 	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1047 	return(value);
1048 }
1049 
1050 /*
1051  - p_b_coll_elem - parse a collating-element name and look it up
1052  == static char p_b_coll_elem(register struct parse *p, int endc);
1053  */
1054 static char			/* value of collating element */
p_b_coll_elem(p,endc)1055 p_b_coll_elem(p, endc)
1056 register struct parse *p;
1057 int endc;			/* name ended by endc,']' */
1058 {
1059 	register char *sp = p->next;
1060 	register struct cname *cp;
1061 	register int len;
1062 
1063 	while (MORE() && !SEETWO(endc, ']'))
1064 		NEXT();
1065 	if (!MORE()) {
1066 		SETERROR(REG_EBRACK);
1067 		return(0);
1068 	}
1069 	len = p->next - sp;
1070 	for (cp = cnames; cp->name != NULL; cp++)
1071 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1072 			return(cp->code);	/* known name */
1073 	if (len == 1)
1074 		return(*sp);	/* single character */
1075 	SETERROR(REG_ECOLLATE);			/* neither */
1076 	return(0);
1077 }
1078 
1079 /*
1080  - othercase - return the case counterpart of an alphabetic
1081  == static char othercase(int ch);
1082  */
1083 static char			/* if no counterpart, return ch */
othercase(ch)1084 othercase(ch)
1085 int ch;
1086 {
1087 	assert(isalpha(ch));
1088 	if (isupper(ch))
1089 		return(tolower(ch));
1090 	else if (islower(ch))
1091 		return(toupper(ch));
1092 	else			/* peculiar, but could happen */
1093 		return(ch);
1094 }
1095 
1096 /*
1097  - bothcases - emit a dualcase version of a two-case character
1098  == static void bothcases(register struct parse *p, int ch);
1099  *
1100  * Boy, is this implementation ever a kludge...
1101  */
1102 static void
bothcases(p,ch)1103 bothcases(p, ch)
1104 register struct parse *p;
1105 int ch;
1106 {
1107 	register char *oldnext = p->next;
1108 	register char *oldend = p->end;
1109 	char bracket[3];
1110 
1111 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1112 	p->next = bracket;
1113 	p->end = bracket+2;
1114 	bracket[0] = ch;
1115 	bracket[1] = ']';
1116 	bracket[2] = '\0';
1117 	p_bracket(p);
1118 	assert(p->next == bracket+2);
1119 	p->next = oldnext;
1120 	p->end = oldend;
1121 }
1122 
1123 /*
1124  - ordinary - emit an ordinary character
1125  == static void ordinary(register struct parse *p, register int ch);
1126  */
1127 static void
ordinary(p,ch)1128 ordinary(p, ch)
1129 register struct parse *p;
1130 register int ch;
1131 {
1132 	register cat_t *cap = p->g->categories;
1133 
1134 	if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
1135 		bothcases(p, ch);
1136 	else {
1137 		EMIT(OCHAR, (unsigned char)ch);
1138 		if (cap[ch] == 0)
1139 			cap[ch] = p->g->ncategories++;
1140 	}
1141 }
1142 
1143 /*
1144  - nonnewline - emit REG_NEWLINE version of OANY
1145  == static void nonnewline(register struct parse *p);
1146  *
1147  * Boy, is this implementation ever a kludge...
1148  */
1149 static void
nonnewline(p)1150 nonnewline(p)
1151 register struct parse *p;
1152 {
1153 	register char *oldnext = p->next;
1154 	register char *oldend = p->end;
1155 	char bracket[4];
1156 
1157 	p->next = bracket;
1158 	p->end = bracket+3;
1159 	bracket[0] = '^';
1160 	bracket[1] = '\n';
1161 	bracket[2] = ']';
1162 	bracket[3] = '\0';
1163 	p_bracket(p);
1164 	assert(p->next == bracket+3);
1165 	p->next = oldnext;
1166 	p->end = oldend;
1167 }
1168 
1169 /*
1170  - repeat - generate code for a bounded repetition, recursively if needed
1171  == static void repeat(register struct parse *p, sopno start, int from, int to);
1172  */
1173 static void
repeat(p,start,from,to)1174 repeat(p, start, from, to)
1175 register struct parse *p;
1176 sopno start;			/* operand from here to end of strip */
1177 int from;			/* repeated from this number */
1178 int to;				/* to this number of times (maybe INFINITY) */
1179 {
1180 	register sopno finish = HERE();
1181 #	define	N	2
1182 #	define	INF	3
1183 #	define	REP(f, t)	((f)*8 + (t))
1184 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1185 	register sopno copy;
1186 
1187 	if (p->error != 0)	/* head off possible runaway recursion */
1188 		return;
1189 
1190 	assert(from <= to);
1191 
1192 	switch (REP(MAP(from), MAP(to))) {
1193 	case REP(0, 0):			/* must be user doing this */
1194 		DROP(finish-start);	/* drop the operand */
1195 		break;
1196 	case REP(0, 1):			/* as x{1,1}? */
1197 	case REP(0, N):			/* as x{1,n}? */
1198 	case REP(0, INF):		/* as x{1,}? */
1199 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1200 		INSERT(OCH_, start);		/* offset is wrong... */
1201 		repeat(p, start+1, 1, to);
1202 		ASTERN(OOR1, start);
1203 		AHEAD(start);			/* ... fix it */
1204 		EMIT(OOR2, 0);
1205 		AHEAD(THERE());
1206 		ASTERN(O_CH, THERETHERE());
1207 		break;
1208 	case REP(1, 1):			/* trivial case */
1209 		/* done */
1210 		break;
1211 	case REP(1, N):			/* as x?x{1,n-1} */
1212 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1213 		INSERT(OCH_, start);
1214 		ASTERN(OOR1, start);
1215 		AHEAD(start);
1216 		EMIT(OOR2, 0);			/* offset very wrong... */
1217 		AHEAD(THERE());			/* ...so fix it */
1218 		ASTERN(O_CH, THERETHERE());
1219 		copy = dupl(p, start+1, finish+1);
1220 		assert(copy == finish+4);
1221 		repeat(p, copy, 1, to-1);
1222 		break;
1223 	case REP(1, INF):		/* as x+ */
1224 		INSERT(OPLUS_, start);
1225 		ASTERN(O_PLUS, start);
1226 		break;
1227 	case REP(N, N):			/* as xx{m-1,n-1} */
1228 		copy = dupl(p, start, finish);
1229 		repeat(p, copy, from-1, to-1);
1230 		break;
1231 	case REP(N, INF):		/* as xx{n-1,INF} */
1232 		copy = dupl(p, start, finish);
1233 		repeat(p, copy, from-1, to);
1234 		break;
1235 	default:			/* "can't happen" */
1236 		SETERROR(REG_ASSERT);	/* just in case */
1237 		break;
1238 	}
1239 }
1240 
1241 /*
1242  - seterr - set an error condition
1243  == static int seterr(register struct parse *p, int e);
1244  */
1245 static int			/* useless but makes type checking happy */
seterr(p,e)1246 seterr(p, e)
1247 register struct parse *p;
1248 int e;
1249 {
1250 	if (p->error == 0)	/* keep earliest error condition */
1251 		p->error = e;
1252 	p->next = nuls;		/* try to bring things to a halt */
1253 	p->end = nuls;
1254 	return(0);		/* make the return value well-defined */
1255 }
1256 
1257 /*
1258  - allocset - allocate a set of characters for []
1259  == static cset *allocset(register struct parse *p);
1260  */
1261 static cset *
allocset(p)1262 allocset(p)
1263 register struct parse *p;
1264 {
1265 	register int no = p->g->ncsets++;
1266 	register size_t nc;
1267 	register size_t nbytes;
1268 	register cset *cs;
1269 	register size_t css = (size_t)p->g->csetsize;
1270 	register int i;
1271 
1272 	if (no >= p->ncsalloc) {	/* need another column of space */
1273 		p->ncsalloc += CHAR_BIT;
1274 		nc = p->ncsalloc;
1275 		assert(nc % CHAR_BIT == 0);
1276 		nbytes = nc / CHAR_BIT * css;
1277 		if (p->g->sets == NULL)
1278 			p->g->sets = (cset *)malloc(nc * sizeof(cset));
1279 		else
1280 			p->g->sets = (cset *)realloc((char *)p->g->sets,
1281 							nc * sizeof(cset));
1282 		if (p->g->setbits == NULL)
1283 			p->g->setbits = (uch *)malloc(nbytes);
1284 		else {
1285 			p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1286 								nbytes);
1287 			/* xxx this isn't right if setbits is now NULL */
1288 			for (i = 0; i < no; i++)
1289 				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1290 		}
1291 		if (p->g->sets != NULL && p->g->setbits != NULL)
1292 			(void) memset((char *)p->g->setbits + (nbytes - css),
1293 								0, css);
1294 		else {
1295 			no = 0;
1296 			SETERROR(REG_ESPACE);
1297 			/* caller's responsibility not to do set ops */
1298 		}
1299 	}
1300 
1301 	assert(p->g->sets != NULL);	/* xxx */
1302 	cs = &p->g->sets[no];
1303 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1304 	cs->mask = 1 << ((no) % CHAR_BIT);
1305 	cs->hash = 0;
1306 	cs->smultis = 0;
1307 	cs->multis = NULL;
1308 
1309 	return(cs);
1310 }
1311 
1312 /*
1313  - freeset - free a now-unused set
1314  == static void freeset(register struct parse *p, register cset *cs);
1315  */
1316 static void
freeset(p,cs)1317 freeset(p, cs)
1318 register struct parse *p;
1319 register cset *cs;
1320 {
1321 	register int i;
1322 	register cset *top = &p->g->sets[p->g->ncsets];
1323 	register size_t css = (size_t)p->g->csetsize;
1324 
1325 	for (i = 0; i < css; i++)
1326 		CHsub(cs, i);
1327 	if (cs == top-1)	/* recover only the easy case */
1328 		p->g->ncsets--;
1329 }
1330 
1331 /*
1332  - freezeset - final processing on a set of characters
1333  == static int freezeset(register struct parse *p, register cset *cs);
1334  *
1335  * The main task here is merging identical sets.  This is usually a waste
1336  * of time (although the hash code minimizes the overhead), but can win
1337  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1338  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1339  * the same value!
1340  */
1341 static int			/* set number */
freezeset(p,cs)1342 freezeset(p, cs)
1343 register struct parse *p;
1344 register cset *cs;
1345 {
1346 	register uch h = cs->hash;
1347 	register int i;
1348 	register cset *top = &p->g->sets[p->g->ncsets];
1349 	register cset *cs2;
1350 	register size_t css = (size_t)p->g->csetsize;
1351 
1352 	/* look for an earlier one which is the same */
1353 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1354 		if (cs2->hash == h && cs2 != cs) {
1355 			/* maybe */
1356 			for (i = 0; i < css; i++)
1357 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1358 					break;		/* no */
1359 			if (i == css)
1360 				break;			/* yes */
1361 		}
1362 
1363 	if (cs2 < top) {	/* found one */
1364 		freeset(p, cs);
1365 		cs = cs2;
1366 	}
1367 
1368 	return((int)(cs - p->g->sets));
1369 }
1370 
1371 /*
1372  - firstch - return first character in a set (which must have at least one)
1373  == static int firstch(register struct parse *p, register cset *cs);
1374  */
1375 static int			/* character; there is no "none" value */
firstch(p,cs)1376 firstch(p, cs)
1377 register struct parse *p;
1378 register cset *cs;
1379 {
1380 	register int i;
1381 	register size_t css = (size_t)p->g->csetsize;
1382 
1383 	for (i = 0; i < css; i++)
1384 		if (CHIN(cs, i))
1385 			return((char)i);
1386 	assert(never);
1387 	return(0);		/* arbitrary */
1388 }
1389 
1390 /*
1391  - nch - number of characters in a set
1392  == static int nch(register struct parse *p, register cset *cs);
1393  */
1394 static int
nch(p,cs)1395 nch(p, cs)
1396 register struct parse *p;
1397 register cset *cs;
1398 {
1399 	register int i;
1400 	register size_t css = (size_t)p->g->csetsize;
1401 	register int n = 0;
1402 
1403 	for (i = 0; i < css; i++)
1404 		if (CHIN(cs, i))
1405 			n++;
1406 	return(n);
1407 }
1408 
1409 /*
1410  - mcadd - add a collating element to a cset
1411  == static void mcadd(register struct parse *p, register cset *cs, \
1412  ==	register char *cp);
1413  */
1414 static void
mcadd(p,cs,cp)1415 mcadd(p, cs, cp)
1416 register struct parse *p;
1417 register cset *cs;
1418 register char *cp;
1419 {
1420 	register size_t oldend = cs->smultis;
1421 
1422 	cs->smultis += strlen(cp) + 1;
1423 	if (cs->multis == NULL)
1424 		cs->multis = malloc(cs->smultis);
1425 	else
1426 		cs->multis = realloc(cs->multis, cs->smultis);
1427 	if (cs->multis == NULL) {
1428 		SETERROR(REG_ESPACE);
1429 		return;
1430 	}
1431 
1432 	(void) strcpy(cs->multis + oldend - 1, cp);
1433 	cs->multis[cs->smultis - 1] = '\0';
1434 }
1435 
1436 /*
1437  - mcinvert - invert the list of collating elements in a cset
1438  == static void mcinvert(register struct parse *p, register cset *cs);
1439  *
1440  * This would have to know the set of possibilities.  Implementation
1441  * is deferred.
1442  */
1443 static void
mcinvert(p,cs)1444 mcinvert(p, cs)
1445 register struct parse *p;
1446 register cset *cs;
1447 {
1448 	assert(cs->multis == NULL);	/* xxx */
1449 }
1450 
1451 /*
1452  - mccase - add case counterparts of the list of collating elements in a cset
1453  == static void mccase(register struct parse *p, register cset *cs);
1454  *
1455  * This would have to know the set of possibilities.  Implementation
1456  * is deferred.
1457  */
1458 static void
mccase(p,cs)1459 mccase(p, cs)
1460 register struct parse *p;
1461 register cset *cs;
1462 {
1463 	assert(cs->multis == NULL);	/* xxx */
1464 }
1465 
1466 /*
1467  - isinsets - is this character in any sets?
1468  == static int isinsets(register struct re_guts *g, int c);
1469  */
1470 static int			/* predicate */
isinsets(g,c)1471 isinsets(g, c)
1472 register struct re_guts *g;
1473 int c;
1474 {
1475 	register uch *col;
1476 	register int i;
1477 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1478 	register unsigned uc = (unsigned char)c;
1479 
1480 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1481 		if (col[uc] != 0)
1482 			return(1);
1483 	return(0);
1484 }
1485 
1486 /*
1487  - samesets - are these two characters in exactly the same sets?
1488  == static int samesets(register struct re_guts *g, int c1, int c2);
1489  */
1490 static int			/* predicate */
samesets(g,c1,c2)1491 samesets(g, c1, c2)
1492 register struct re_guts *g;
1493 int c1;
1494 int c2;
1495 {
1496 	register uch *col;
1497 	register int i;
1498 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1499 	register unsigned uc1 = (unsigned char)c1;
1500 	register unsigned uc2 = (unsigned char)c2;
1501 
1502 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1503 		if (col[uc1] != col[uc2])
1504 			return(0);
1505 	return(1);
1506 }
1507 
1508 /*
1509  - categorize - sort out character categories
1510  == static void categorize(struct parse *p, register struct re_guts *g);
1511  */
1512 static void
categorize(p,g)1513 categorize(p, g)
1514 struct parse *p;
1515 register struct re_guts *g;
1516 {
1517 	register cat_t *cats = g->categories;
1518 	register int c;
1519 	register int c2;
1520 	register cat_t cat;
1521 
1522 	/* avoid making error situations worse */
1523 	if (p->error != 0)
1524 		return;
1525 
1526 	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1527 		if (cats[c] == 0 && isinsets(g, c)) {
1528 			cat = g->ncategories++;
1529 			cats[c] = cat;
1530 			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1531 				if (cats[c2] == 0 && samesets(g, c, c2))
1532 					cats[c2] = cat;
1533 		}
1534 }
1535 
1536 /*
1537  - dupl - emit a duplicate of a bunch of sops
1538  == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1539  */
1540 static sopno			/* start of duplicate */
dupl(p,start,finish)1541 dupl(p, start, finish)
1542 register struct parse *p;
1543 sopno start;			/* from here */
1544 sopno finish;			/* to this less one */
1545 {
1546 	register sopno ret = HERE();
1547 	register sopno len = finish - start;
1548 
1549 	assert(finish >= start);
1550 	if (len == 0)
1551 		return(ret);
1552 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1553 	assert(p->ssize >= p->slen + len);
1554 	(void) memcpy((char *)(p->strip + p->slen),
1555 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1556 	p->slen += len;
1557 	return(ret);
1558 }
1559 
1560 /*
1561  - doemit - emit a strip operator
1562  == static void doemit(register struct parse *p, sop op, size_t opnd);
1563  *
1564  * It might seem better to implement this as a macro with a function as
1565  * hard-case backup, but it's just too big and messy unless there are
1566  * some changes to the data structures.  Maybe later.
1567  */
1568 static void
doemit(p,op,opnd)1569 doemit(p, op, opnd)
1570 register struct parse *p;
1571 sop op;
1572 size_t opnd;
1573 {
1574 	/* avoid making error situations worse */
1575 	if (p->error != 0)
1576 		return;
1577 
1578 	/* deal with oversize operands ("can't happen", more or less) */
1579 	assert(opnd < 1<<OPSHIFT);
1580 
1581 	/* deal with undersized strip */
1582 	if (p->slen >= p->ssize)
1583 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1584 	assert(p->slen < p->ssize);
1585 
1586 	/* finally, it's all reduced to the easy case */
1587 	p->strip[p->slen++] = SOP(op, opnd);
1588 }
1589 
1590 /*
1591  - doinsert - insert a sop into the strip
1592  == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1593  */
1594 static void
doinsert(p,op,opnd,pos)1595 doinsert(p, op, opnd, pos)
1596 register struct parse *p;
1597 sop op;
1598 size_t opnd;
1599 sopno pos;
1600 {
1601 	register sopno sn;
1602 	register sop s;
1603 	register int i;
1604 
1605 	/* avoid making error situations worse */
1606 	if (p->error != 0)
1607 		return;
1608 
1609 	sn = HERE();
1610 	EMIT(op, opnd);		/* do checks, ensure space */
1611 	assert(HERE() == sn+1);
1612 	s = p->strip[sn];
1613 
1614 	/* adjust paren pointers */
1615 	assert(pos > 0);
1616 	for (i = 1; i < NPAREN; i++) {
1617 		if (p->pbegin[i] >= pos) {
1618 			p->pbegin[i]++;
1619 		}
1620 		if (p->pend[i] >= pos) {
1621 			p->pend[i]++;
1622 		}
1623 	}
1624 
1625 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1626 						(HERE()-pos-1)*sizeof(sop));
1627 	p->strip[pos] = s;
1628 }
1629 
1630 /*
1631  - dofwd - complete a forward reference
1632  == static void dofwd(register struct parse *p, sopno pos, sop value);
1633  */
1634 static void
dofwd(p,pos,value)1635 dofwd(p, pos, value)
1636 register struct parse *p;
1637 register sopno pos;
1638 sop value;
1639 {
1640 	/* avoid making error situations worse */
1641 	if (p->error != 0)
1642 		return;
1643 
1644 	assert(value < 1<<OPSHIFT);
1645 	p->strip[pos] = OP(p->strip[pos]) | value;
1646 }
1647 
1648 /*
1649  - enlarge - enlarge the strip
1650  == static void enlarge(register struct parse *p, sopno size);
1651  */
1652 static void
enlarge(p,size)1653 enlarge(p, size)
1654 register struct parse *p;
1655 register sopno size;
1656 {
1657 	register sop *sp;
1658 
1659 	if (p->ssize >= size)
1660 		return;
1661 
1662 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1663 	if (sp == NULL) {
1664 		SETERROR(REG_ESPACE);
1665 		return;
1666 	}
1667 	p->strip = sp;
1668 	p->ssize = size;
1669 }
1670 
1671 /*
1672  - stripsnug - compact the strip
1673  == static void stripsnug(register struct parse *p, register struct re_guts *g);
1674  */
1675 static void
stripsnug(p,g)1676 stripsnug(p, g)
1677 register struct parse *p;
1678 register struct re_guts *g;
1679 {
1680 	g->nstates = p->slen;
1681 	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1682 	if (g->strip == NULL) {
1683 		SETERROR(REG_ESPACE);
1684 		g->strip = p->strip;
1685 	}
1686 }
1687 
1688 /*
1689  - findmust - fill in must and mlen with longest mandatory literal string
1690  == static void findmust(register struct parse *p, register struct re_guts *g);
1691  *
1692  * This algorithm could do fancy things like analyzing the operands of |
1693  * for common subsequences.  Someday.  This code is simple and finds most
1694  * of the interesting cases.
1695  *
1696  * Note that must and mlen got initialized during setup.
1697  */
1698 static void
findmust(p,g)1699 findmust(p, g)
1700 struct parse *p;
1701 register struct re_guts *g;
1702 {
1703 	register sop *scan;
1704 	sop *start;
1705 	register sop *newstart;
1706 	register sopno newlen;
1707 	register sop s;
1708 	register char *cp;
1709 	register sopno i;
1710 
1711 	/* avoid making error situations worse */
1712 	if (p->error != 0)
1713 		return;
1714 
1715 	/* find the longest OCHAR sequence in strip */
1716 	newlen = 0;
1717 	scan = g->strip + 1;
1718 	do {
1719 		s = *scan++;
1720 		switch (OP(s)) {
1721 		case OCHAR:		/* sequence member */
1722 			if (newlen == 0)		/* new sequence */
1723 				newstart = scan - 1;
1724 			newlen++;
1725 			break;
1726 		case OPLUS_:		/* things that don't break one */
1727 		case OLPAREN:
1728 		case ORPAREN:
1729 			break;
1730 		case OQUEST_:		/* things that must be skipped */
1731 		case OCH_:
1732 			scan--;
1733 			do {
1734 				scan += OPND(s);
1735 				s = *scan;
1736 				/* assert() interferes w debug printouts */
1737 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1738 							OP(s) != OOR2) {
1739 					g->iflags |= BAD;
1740 					return;
1741 				}
1742 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1743 			/* fallthrough */
1744 		default:		/* things that break a sequence */
1745 			if (newlen > g->mlen) {		/* ends one */
1746 				start = newstart;
1747 				g->mlen = newlen;
1748 			}
1749 			newlen = 0;
1750 			break;
1751 		}
1752 	} while (OP(s) != OEND);
1753 
1754 	if (g->mlen == 0)		/* there isn't one */
1755 		return;
1756 
1757 	/* turn it into a character string */
1758 	g->must = malloc((size_t)g->mlen + 1);
1759 	if (g->must == NULL) {		/* argh; just forget it */
1760 		g->mlen = 0;
1761 		return;
1762 	}
1763 	cp = g->must;
1764 	scan = start;
1765 	for (i = g->mlen; i > 0; i--) {
1766 		while (OP(s = *scan++) != OCHAR)
1767 			continue;
1768 		assert(cp < g->must + g->mlen);
1769 		*cp++ = (char)OPND(s);
1770 	}
1771 	assert(cp == g->must + g->mlen);
1772 	*cp++ = '\0';		/* just on general principles */
1773 }
1774 
1775 /*
1776  - pluscount - count + nesting
1777  == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1778  */
1779 static sopno			/* nesting depth */
pluscount(p,g)1780 pluscount(p, g)
1781 struct parse *p;
1782 register struct re_guts *g;
1783 {
1784 	register sop *scan;
1785 	register sop s;
1786 	register sopno plusnest = 0;
1787 	register sopno maxnest = 0;
1788 
1789 	if (p->error != 0)
1790 		return(0);	/* there may not be an OEND */
1791 
1792 	scan = g->strip + 1;
1793 	do {
1794 		s = *scan++;
1795 		switch (OP(s)) {
1796 		case OPLUS_:
1797 			plusnest++;
1798 			break;
1799 		case O_PLUS:
1800 			if (plusnest > maxnest)
1801 				maxnest = plusnest;
1802 			plusnest--;
1803 			break;
1804 		}
1805 	} while (OP(s) != OEND);
1806 	if (plusnest != 0)
1807 		g->iflags |= BAD;
1808 	return(maxnest);
1809 }
1810 
1811 static int nope = 0;		/* for use in asserts; shuts lint up */
1812 
1813 /* macros for manipulating states, small version */
1814 #define	states	unsigned
1815 #define	states1	unsigned	/* for later use in regexec() decision */
1816 #define	CLEAR(v)	((v) = 0)
1817 #define	SET0(v, n)	((v) &= ~((unsigned)1 << (n)))
1818 #define	SET1(v, n)	((v) |= (unsigned)1 << (n))
1819 #define	ISSET(v, n)	((v) & ((unsigned)1 << (n)))
1820 #define	ASSIGN(d, s)	((d) = (s))
1821 #define	EQ(a, b)	((a) == (b))
1822 #define	STATEVARS	int dummy	/* dummy version */
1823 #define	STATESETUP(m, n)	/* nothing */
1824 #define	STATETEARDOWN(m)	/* nothing */
1825 #define	SETUP(v)	((v) = 0)
1826 #define	onestate	unsigned
1827 #define	INIT(o, n)	((o) = (unsigned)1 << (n))
1828 #define	INC(o)	((o) <<= 1)
1829 #define	ISSTATEIN(v, o)	((v) & (o))
1830 /* some abbreviations; note that some of these know variable names! */
1831 /* do "if I'm here, I can also be there" etc without branches */
1832 #define	FWD(dst, src, n)	((dst) |= ((unsigned)(src)&(here)) << (n))
1833 #define	BACK(dst, src, n)	((dst) |= ((unsigned)(src)&(here)) >> (n))
1834 #define	ISSETBACK(v, n)	((v) & ((unsigned)here >> (n)))
1835 /* function names */
1836 #define SNAMES			/* engine.c looks after details */
1837 
1838 /*
1839  * The matching engine and friends.  This file is #included by regexec.c
1840  * after suitable #defines of a variety of macros used herein, so that
1841  * different state representations can be used without duplicating masses
1842  * of code.
1843  */
1844 
1845 #ifdef SNAMES
1846 #define	matcher	smatcher
1847 #define	fast	sfast
1848 #define	slow	sslow
1849 #define	dissect	sdissect
1850 #define	backref	sbackref
1851 #define	step	sstep
1852 #define	print	sprint
1853 #define	at	sat
1854 #define	match	smat
1855 #endif
1856 #ifdef LNAMES
1857 #define	matcher	lmatcher
1858 #define	fast	lfast
1859 #define	slow	lslow
1860 #define	dissect	ldissect
1861 #define	backref	lbackref
1862 #define	step	lstep
1863 #define	print	lprint
1864 #define	at	lat
1865 #define	match	lmat
1866 #endif
1867 
1868 /* another structure passed up and down to avoid zillions of parameters */
1869 struct match {
1870 	struct re_guts *g;
1871 	int eflags;
1872 	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
1873 	char *offp;		/* offsets work from here */
1874 	char *beginp;		/* start of string -- virtual NUL precedes */
1875 	char *endp;		/* end of string -- virtual NUL here */
1876 	char *coldp;		/* can be no match starting before here */
1877 	char **lastpos;		/* [nplus+1] */
1878 	STATEVARS;
1879 	states st;		/* current states */
1880 	states fresh;		/* states for a fresh start */
1881 	states tmp;		/* temporary */
1882 	states empty;		/* empty set of states */
1883 };
1884 
1885 static int matcher(register struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
1886 static char *dissect(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
1887 static char *backref(register struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
1888 static char *fast(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
1889 static char *slow(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
1890 static states step(register struct re_guts *g, sopno start, sopno stop, register states bef, int ch, register states aft);
1891 #define	BOL	(OUT+1)
1892 #define	EOL	(BOL+1)
1893 #define	BOLEOL	(BOL+2)
1894 #define	NOTHING	(BOL+3)
1895 #define	BOW	(BOL+4)
1896 #define	EOW	(BOL+5)
1897 #define	CODEMAX	(BOL+5)		/* highest code used */
1898 #define	NONCHAR(c)	((c) > CHAR_MAX)
1899 #define	NNONCHAR	(CODEMAX-CHAR_MAX)
1900 #define	SP(t, s, c)	/* nothing */
1901 #define	AT(t, p1, p2, s1, s2)	/* nothing */
1902 #define	NOTE(s)	/* nothing */
1903 
1904 /*
1905  - matcher - the actual matching engine
1906  == static int matcher(register struct re_guts *g, char *string, \
1907  ==	size_t nmatch, regmatch_t pmatch[], int eflags);
1908  */
1909 static int			/* 0 success, REG_NOMATCH failure */
matcher(g,string,nmatch,pmatch,eflags)1910 matcher(g, string, nmatch, pmatch, eflags)
1911 register struct re_guts *g;
1912 char *string;
1913 size_t nmatch;
1914 regmatch_t pmatch[];
1915 int eflags;
1916 {
1917 	register char *endp;
1918 	register int i;
1919 	struct match mv;
1920 	register struct match *m = &mv;
1921 	register char *dp;
1922 	const register sopno gf = g->firststate+1;	/* +1 for OEND */
1923 	const register sopno gl = g->laststate;
1924 	char *start;
1925 	char *stop;
1926 
1927 	/* simplify the situation where possible */
1928 	if (g->cflags&REG_NOSUB)
1929 		nmatch = 0;
1930 	if (eflags&REG_STARTEND) {
1931 		start = string + pmatch[0].rm_so;
1932 		stop = string + pmatch[0].rm_eo;
1933 	} else {
1934 		start = string;
1935 		stop = start + strlen(start);
1936 	}
1937 	if (stop < start)
1938 		return(REG_INVARG);
1939 
1940 	/* prescreening; this does wonders for this rather slow code */
1941 	if (g->must != NULL) {
1942 		for (dp = start; dp < stop; dp++)
1943 			if (*dp == g->must[0] && stop - dp >= g->mlen &&
1944 				memcmp(dp, g->must, (size_t)g->mlen) == 0)
1945 				break;
1946 		if (dp == stop)		/* we didn't find g->must */
1947 			return(REG_NOMATCH);
1948 	}
1949 
1950 	/* match struct setup */
1951 	m->g = g;
1952 	m->eflags = eflags;
1953 	m->pmatch = NULL;
1954 	m->lastpos = NULL;
1955 	m->offp = string;
1956 	m->beginp = start;
1957 	m->endp = stop;
1958 	STATESETUP(m, 4);
1959 	SETUP(m->st);
1960 	SETUP(m->fresh);
1961 	SETUP(m->tmp);
1962 	SETUP(m->empty);
1963 	CLEAR(m->empty);
1964 
1965 	/* this loop does only one repetition except for backrefs */
1966 	for (;;) {
1967 		endp = fast(m, start, stop, gf, gl);
1968 		if (endp == NULL) {		/* a miss */
1969 			STATETEARDOWN(m);
1970 			return(REG_NOMATCH);
1971 		}
1972 		if (nmatch == 0 && !g->backrefs)
1973 			break;		/* no further info needed */
1974 
1975 		/* where? */
1976 		assert(m->coldp != NULL);
1977 		for (;;) {
1978 			NOTE("finding start");
1979 			endp = slow(m, m->coldp, stop, gf, gl);
1980 			if (endp != NULL)
1981 				break;
1982 			assert(m->coldp < m->endp);
1983 			m->coldp++;
1984 		}
1985 		if (nmatch == 1 && !g->backrefs)
1986 			break;		/* no further info needed */
1987 
1988 		/* oh my, he wants the subexpressions... */
1989 		if (m->pmatch == NULL)
1990 			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
1991 							sizeof(regmatch_t));
1992 		if (m->pmatch == NULL) {
1993 			STATETEARDOWN(m);
1994 			return(REG_ESPACE);
1995 		}
1996 		for (i = 1; i <= m->g->nsub; i++)
1997 			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
1998 		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
1999 			NOTE("dissecting");
2000 			dp = dissect(m, m->coldp, endp, gf, gl);
2001 		} else {
2002 			if (g->nplus > 0 && m->lastpos == NULL)
2003 				m->lastpos = (char **)malloc((g->nplus+1) *
2004 							sizeof(char *));
2005 			if (g->nplus > 0 && m->lastpos == NULL) {
2006 				free(m->pmatch);
2007 				STATETEARDOWN(m);
2008 				return(REG_ESPACE);
2009 			}
2010 			NOTE("backref dissect");
2011 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
2012 		}
2013 		if (dp != NULL)
2014 			break;
2015 
2016 		/* uh-oh... we couldn't find a subexpression-level match */
2017 		assert(g->backrefs);	/* must be back references doing it */
2018 		assert(g->nplus == 0 || m->lastpos != NULL);
2019 		for (;;) {
2020 			if (dp != NULL || endp <= m->coldp)
2021 				break;		/* defeat */
2022 			NOTE("backoff");
2023 			endp = slow(m, m->coldp, endp-1, gf, gl);
2024 			if (endp == NULL)
2025 				break;		/* defeat */
2026 			/* try it on a shorter possibility */
2027 			NOTE("backoff dissect");
2028 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
2029 		}
2030 		assert(dp == NULL || dp == endp);
2031 		if (dp != NULL)		/* found a shorter one */
2032 			break;
2033 
2034 		/* despite initial appearances, there is no match here */
2035 		NOTE("false alarm");
2036 		start = m->coldp + 1;	/* recycle starting later */
2037 		assert(start <= stop);
2038 	}
2039 
2040 	/* fill in the details if requested */
2041 	if (nmatch > 0) {
2042 		pmatch[0].rm_so = m->coldp - m->offp;
2043 		pmatch[0].rm_eo = endp - m->offp;
2044 	}
2045 	if (nmatch > 1) {
2046 		assert(m->pmatch != NULL);
2047 		for (i = 1; i < nmatch; i++)
2048 			if (i <= m->g->nsub)
2049 				pmatch[i] = m->pmatch[i];
2050 			else {
2051 				pmatch[i].rm_so = -1;
2052 				pmatch[i].rm_eo = -1;
2053 			}
2054 	}
2055 
2056 	if (m->pmatch != NULL)
2057 		free((char *)m->pmatch);
2058 	if (m->lastpos != NULL)
2059 		free((char *)m->lastpos);
2060 	STATETEARDOWN(m);
2061 	return(0);
2062 }
2063 
2064 /*
2065  - dissect - figure out what matched what, no back references
2066  == static char *dissect(register struct match *m, char *start, \
2067  ==	char *stop, sopno startst, sopno stopst);
2068  */
2069 static char *			/* == stop (success) always */
dissect(m,start,stop,startst,stopst)2070 dissect(m, start, stop, startst, stopst)
2071 register struct match *m;
2072 char *start;
2073 char *stop;
2074 sopno startst;
2075 sopno stopst;
2076 {
2077 	register int i;
2078 	register sopno ss;	/* start sop of current subRE */
2079 	register sopno es;	/* end sop of current subRE */
2080 	register char *sp;	/* start of string matched by it */
2081 	register char *stp;	/* string matched by it cannot pass here */
2082 	register char *rest;	/* start of rest of string */
2083 	register char *tail;	/* string unmatched by rest of RE */
2084 	register sopno ssub;	/* start sop of subsubRE */
2085 	register sopno esub;	/* end sop of subsubRE */
2086 	register char *ssp;	/* start of string matched by subsubRE */
2087 	register char *sep;	/* end of string matched by subsubRE */
2088 	register char *oldssp;	/* previous ssp */
2089 	register char *dp;
2090 
2091 	AT("diss", start, stop, startst, stopst);
2092 	sp = start;
2093 	for (ss = startst; ss < stopst; ss = es) {
2094 		/* identify end of subRE */
2095 		es = ss;
2096 		switch (OP(m->g->strip[es])) {
2097 		case OPLUS_:
2098 		case OQUEST_:
2099 			es += OPND(m->g->strip[es]);
2100 			break;
2101 		case OCH_:
2102 			while (OP(m->g->strip[es]) != O_CH)
2103 				es += OPND(m->g->strip[es]);
2104 			break;
2105 		}
2106 		es++;
2107 
2108 		/* figure out what it matched */
2109 		switch (OP(m->g->strip[ss])) {
2110 		case OEND:
2111 			assert(nope);
2112 			break;
2113 		case OCHAR:
2114 			sp++;
2115 			break;
2116 		case OBOL:
2117 		case OEOL:
2118 		case OBOW:
2119 		case OEOW:
2120 			break;
2121 		case OANY:
2122 		case OANYOF:
2123 			sp++;
2124 			break;
2125 		case OBACK_:
2126 		case O_BACK:
2127 			assert(nope);
2128 			break;
2129 		/* cases where length of match is hard to find */
2130 		case OQUEST_:
2131 			stp = stop;
2132 			for (;;) {
2133 				/* how long could this one be? */
2134 				rest = slow(m, sp, stp, ss, es);
2135 				assert(rest != NULL);	/* it did match */
2136 				/* could the rest match the rest? */
2137 				tail = slow(m, rest, stop, es, stopst);
2138 				if (tail == stop)
2139 					break;		/* yes! */
2140 				/* no -- try a shorter match for this one */
2141 				stp = rest - 1;
2142 				assert(stp >= sp);	/* it did work */
2143 			}
2144 			ssub = ss + 1;
2145 			esub = es - 1;
2146 			/* did innards match? */
2147 			if (slow(m, sp, rest, ssub, esub) != NULL) {
2148 				dp = dissect(m, sp, rest, ssub, esub);
2149 				assert(dp == rest);
2150 			} else		/* no */
2151 				assert(sp == rest);
2152 			sp = rest;
2153 			break;
2154 		case OPLUS_:
2155 			stp = stop;
2156 			for (;;) {
2157 				/* how long could this one be? */
2158 				rest = slow(m, sp, stp, ss, es);
2159 				assert(rest != NULL);	/* it did match */
2160 				/* could the rest match the rest? */
2161 				tail = slow(m, rest, stop, es, stopst);
2162 				if (tail == stop)
2163 					break;		/* yes! */
2164 				/* no -- try a shorter match for this one */
2165 				stp = rest - 1;
2166 				assert(stp >= sp);	/* it did work */
2167 			}
2168 			ssub = ss + 1;
2169 			esub = es - 1;
2170 			ssp = sp;
2171 			oldssp = ssp;
2172 			for (;;) {	/* find last match of innards */
2173 				sep = slow(m, ssp, rest, ssub, esub);
2174 				if (sep == NULL || sep == ssp)
2175 					break;	/* failed or matched null */
2176 				oldssp = ssp;	/* on to next try */
2177 				ssp = sep;
2178 			}
2179 			if (sep == NULL) {
2180 				/* last successful match */
2181 				sep = ssp;
2182 				ssp = oldssp;
2183 			}
2184 			assert(sep == rest);	/* must exhaust substring */
2185 			assert(slow(m, ssp, sep, ssub, esub) == rest);
2186 			dp = dissect(m, ssp, sep, ssub, esub);
2187 			assert(dp == sep);
2188 			sp = rest;
2189 			break;
2190 		case OCH_:
2191 			stp = stop;
2192 			for (;;) {
2193 				/* how long could this one be? */
2194 				rest = slow(m, sp, stp, ss, es);
2195 				assert(rest != NULL);	/* it did match */
2196 				/* could the rest match the rest? */
2197 				tail = slow(m, rest, stop, es, stopst);
2198 				if (tail == stop)
2199 					break;		/* yes! */
2200 				/* no -- try a shorter match for this one */
2201 				stp = rest - 1;
2202 				assert(stp >= sp);	/* it did work */
2203 			}
2204 			ssub = ss + 1;
2205 			esub = ss + OPND(m->g->strip[ss]) - 1;
2206 			assert(OP(m->g->strip[esub]) == OOR1);
2207 			for (;;) {	/* find first matching branch */
2208 				if (slow(m, sp, rest, ssub, esub) == rest)
2209 					break;	/* it matched all of it */
2210 				/* that one missed, try next one */
2211 				assert(OP(m->g->strip[esub]) == OOR1);
2212 				esub++;
2213 				assert(OP(m->g->strip[esub]) == OOR2);
2214 				ssub = esub + 1;
2215 				esub += OPND(m->g->strip[esub]);
2216 				if (OP(m->g->strip[esub]) == OOR2)
2217 					esub--;
2218 				else
2219 					assert(OP(m->g->strip[esub]) == O_CH);
2220 			}
2221 			dp = dissect(m, sp, rest, ssub, esub);
2222 			assert(dp == rest);
2223 			sp = rest;
2224 			break;
2225 		case O_PLUS:
2226 		case O_QUEST:
2227 		case OOR1:
2228 		case OOR2:
2229 		case O_CH:
2230 			assert(nope);
2231 			break;
2232 		case OLPAREN:
2233 			i = OPND(m->g->strip[ss]);
2234 			assert(0 < i && i <= m->g->nsub);
2235 			m->pmatch[i].rm_so = sp - m->offp;
2236 			break;
2237 		case ORPAREN:
2238 			i = OPND(m->g->strip[ss]);
2239 			assert(0 < i && i <= m->g->nsub);
2240 			m->pmatch[i].rm_eo = sp - m->offp;
2241 			break;
2242 		default:		/* uh oh */
2243 			assert(nope);
2244 			break;
2245 		}
2246 	}
2247 
2248 	assert(sp == stop);
2249 	return(sp);
2250 }
2251 
2252 /*
2253  - backref - figure out what matched what, figuring in back references
2254  == static char *backref(register struct match *m, char *start, \
2255  ==	char *stop, sopno startst, sopno stopst, sopno lev);
2256  */
2257 static char *			/* == stop (success) or NULL (failure) */
backref(m,start,stop,startst,stopst,lev)2258 backref(m, start, stop, startst, stopst, lev)
2259 register struct match *m;
2260 char *start;
2261 char *stop;
2262 sopno startst;
2263 sopno stopst;
2264 sopno lev;			/* PLUS nesting level */
2265 {
2266 	register int i;
2267 	register sopno ss;	/* start sop of current subRE */
2268 	register char *sp;	/* start of string matched by it */
2269 	register sopno ssub;	/* start sop of subsubRE */
2270 	register sopno esub;	/* end sop of subsubRE */
2271 	register char *ssp;	/* start of string matched by subsubRE */
2272 	register char *dp;
2273 	register size_t len;
2274 	register int hard;
2275 	register sop s;
2276 	register regoff_t offsave;
2277 	register cset *cs;
2278 
2279 	AT("back", start, stop, startst, stopst);
2280 	sp = start;
2281 
2282 	/* get as far as we can with easy stuff */
2283 	hard = 0;
2284 	for (ss = startst; !hard && ss < stopst; ss++)
2285 		switch (OP(s = m->g->strip[ss])) {
2286 		case OCHAR:
2287 			if (sp == stop || *sp++ != (char)OPND(s))
2288 				return(NULL);
2289 			break;
2290 		case OANY:
2291 			if (sp == stop)
2292 				return(NULL);
2293 			sp++;
2294 			break;
2295 		case OANYOF:
2296 			cs = &m->g->sets[OPND(s)];
2297 			if (sp == stop || !CHIN(cs, *sp++))
2298 				return(NULL);
2299 			break;
2300 		case OBOL:
2301 			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
2302 					(sp < m->endp && *(sp-1) == '\n' &&
2303 						(m->g->cflags&REG_NEWLINE)) )
2304 				{ /* yes */ }
2305 			else
2306 				return(NULL);
2307 			break;
2308 		case OEOL:
2309 			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
2310 					(sp < m->endp && *sp == '\n' &&
2311 						(m->g->cflags&REG_NEWLINE)) )
2312 				{ /* yes */ }
2313 			else
2314 				return(NULL);
2315 			break;
2316 		case OBOW:
2317 			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
2318 					(sp < m->endp && *(sp-1) == '\n' &&
2319 						(m->g->cflags&REG_NEWLINE)) ||
2320 					(sp > m->beginp &&
2321 							!ISWORD(*(sp-1))) ) &&
2322 					(sp < m->endp && ISWORD(*sp)) )
2323 				{ /* yes */ }
2324 			else
2325 				return(NULL);
2326 			break;
2327 		case OEOW:
2328 			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
2329 					(sp < m->endp && *sp == '\n' &&
2330 						(m->g->cflags&REG_NEWLINE)) ||
2331 					(sp < m->endp && !ISWORD(*sp)) ) &&
2332 					(sp > m->beginp && ISWORD(*(sp-1))) )
2333 				{ /* yes */ }
2334 			else
2335 				return(NULL);
2336 			break;
2337 		case O_QUEST:
2338 			break;
2339 		case OOR1:	/* matches null but needs to skip */
2340 			ss++;
2341 			s = m->g->strip[ss];
2342 			do {
2343 				assert(OP(s) == OOR2);
2344 				ss += OPND(s);
2345 			} while (OP(s = m->g->strip[ss]) != O_CH);
2346 			/* note that the ss++ gets us past the O_CH */
2347 			break;
2348 		default:	/* have to make a choice */
2349 			hard = 1;
2350 			break;
2351 		}
2352 	if (!hard) {		/* that was it! */
2353 		if (sp != stop)
2354 			return(NULL);
2355 		return(sp);
2356 	}
2357 	ss--;			/* adjust for the for's final increment */
2358 
2359 	/* the hard stuff */
2360 	AT("hard", sp, stop, ss, stopst);
2361 	s = m->g->strip[ss];
2362 	switch (OP(s)) {
2363 	case OBACK_:		/* the vilest depths */
2364 		i = OPND(s);
2365 		assert(0 < i && i <= m->g->nsub);
2366 		if (m->pmatch[i].rm_eo == -1)
2367 			return(NULL);
2368 		assert(m->pmatch[i].rm_so != -1);
2369 		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
2370 		assert(stop - m->beginp >= len);
2371 		if (sp > stop - len)
2372 			return(NULL);	/* not enough left to match */
2373 		ssp = m->offp + m->pmatch[i].rm_so;
2374 		if (memcmp(sp, ssp, len) != 0)
2375 			return(NULL);
2376 		while (m->g->strip[ss] != SOP(O_BACK, i))
2377 			ss++;
2378 		return(backref(m, sp+len, stop, ss+1, stopst, lev));
2379 		break;
2380 	case OQUEST_:		/* to null or not */
2381 		dp = backref(m, sp, stop, ss+1, stopst, lev);
2382 		if (dp != NULL)
2383 			return(dp);	/* not */
2384 		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
2385 		break;
2386 	case OPLUS_:
2387 		assert(m->lastpos != NULL);
2388 		assert(lev+1 <= m->g->nplus);
2389 		m->lastpos[lev+1] = sp;
2390 		return(backref(m, sp, stop, ss+1, stopst, lev+1));
2391 		break;
2392 	case O_PLUS:
2393 		if (sp == m->lastpos[lev])	/* last pass matched null */
2394 			return(backref(m, sp, stop, ss+1, stopst, lev-1));
2395 		/* try another pass */
2396 		m->lastpos[lev] = sp;
2397 		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
2398 		if (dp == NULL)
2399 			return(backref(m, sp, stop, ss+1, stopst, lev-1));
2400 		else
2401 			return(dp);
2402 		break;
2403 	case OCH_:		/* find the right one, if any */
2404 		ssub = ss + 1;
2405 		esub = ss + OPND(s) - 1;
2406 		assert(OP(m->g->strip[esub]) == OOR1);
2407 		for (;;) {	/* find first matching branch */
2408 			dp = backref(m, sp, stop, ssub, esub, lev);
2409 			if (dp != NULL)
2410 				return(dp);
2411 			/* that one missed, try next one */
2412 			if (OP(m->g->strip[esub]) == O_CH)
2413 				return(NULL);	/* there is none */
2414 			esub++;
2415 			assert(OP(m->g->strip[esub]) == OOR2);
2416 			ssub = esub + 1;
2417 			esub += OPND(m->g->strip[esub]);
2418 			if (OP(m->g->strip[esub]) == OOR2)
2419 				esub--;
2420 			else
2421 				assert(OP(m->g->strip[esub]) == O_CH);
2422 		}
2423 		break;
2424 	case OLPAREN:		/* must undo assignment if rest fails */
2425 		i = OPND(s);
2426 		assert(0 < i && i <= m->g->nsub);
2427 		offsave = m->pmatch[i].rm_so;
2428 		m->pmatch[i].rm_so = sp - m->offp;
2429 		dp = backref(m, sp, stop, ss+1, stopst, lev);
2430 		if (dp != NULL)
2431 			return(dp);
2432 		m->pmatch[i].rm_so = offsave;
2433 		return(NULL);
2434 		break;
2435 	case ORPAREN:		/* must undo assignment if rest fails */
2436 		i = OPND(s);
2437 		assert(0 < i && i <= m->g->nsub);
2438 		offsave = m->pmatch[i].rm_eo;
2439 		m->pmatch[i].rm_eo = sp - m->offp;
2440 		dp = backref(m, sp, stop, ss+1, stopst, lev);
2441 		if (dp != NULL)
2442 			return(dp);
2443 		m->pmatch[i].rm_eo = offsave;
2444 		return(NULL);
2445 		break;
2446 	default:		/* uh oh */
2447 		assert(nope);
2448 		break;
2449 	}
2450 
2451 	/* "can't happen" */
2452 	assert(nope);
2453 	/* NOTREACHED */
2454 	return((char *)NULL);	/* dummy */
2455 }
2456 
2457 /*
2458  - fast - step through the string at top speed
2459  == static char *fast(register struct match *m, char *start, \
2460  ==	char *stop, sopno startst, sopno stopst);
2461  */
2462 static char *			/* where tentative match ended, or NULL */
fast(m,start,stop,startst,stopst)2463 fast(m, start, stop, startst, stopst)
2464 register struct match *m;
2465 char *start;
2466 char *stop;
2467 sopno startst;
2468 sopno stopst;
2469 {
2470 	register states st = m->st;
2471 	register states fresh = m->fresh;
2472 	register states tmp = m->tmp;
2473 	register char *p = start;
2474 	register int c = (start == m->beginp) ? OUT : *(start-1);
2475 	register int lastc;	/* previous c */
2476 	register int flagch;
2477 	register int i;
2478 	register char *coldp;	/* last p after which no match was underway */
2479 
2480 	CLEAR(st);
2481 	SET1(st, startst);
2482 	st = step(m->g, startst, stopst, st, NOTHING, st);
2483 	ASSIGN(fresh, st);
2484 	SP("start", st, *p);
2485 	coldp = NULL;
2486 	for (;;) {
2487 		/* next character */
2488 		lastc = c;
2489 		c = (p == m->endp) ? OUT : *p;
2490 		if (EQ(st, fresh))
2491 			coldp = p;
2492 
2493 		/* is there an EOL and/or BOL between lastc and c? */
2494 		flagch = '\0';
2495 		i = 0;
2496 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
2497 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
2498 			flagch = BOL;
2499 			i = m->g->nbol;
2500 		}
2501 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
2502 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
2503 			flagch = (flagch == BOL) ? BOLEOL : EOL;
2504 			i += m->g->neol;
2505 		}
2506 		if (i != 0) {
2507 			for (; i > 0; i--)
2508 				st = step(m->g, startst, stopst, st, flagch, st);
2509 			SP("boleol", st, c);
2510 		}
2511 
2512 		/* how about a word boundary? */
2513 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
2514 					(c != OUT && ISWORD(c)) ) {
2515 			flagch = BOW;
2516 		}
2517 		if ( (lastc != OUT && ISWORD(lastc)) &&
2518 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
2519 			flagch = EOW;
2520 		}
2521 		if (flagch == BOW || flagch == EOW) {
2522 			st = step(m->g, startst, stopst, st, flagch, st);
2523 			SP("boweow", st, c);
2524 		}
2525 
2526 		/* are we done? */
2527 		if (ISSET(st, stopst) || p == stop)
2528 			break;		/* NOTE BREAK OUT */
2529 
2530 		/* no, we must deal with this character */
2531 		ASSIGN(tmp, st);
2532 		ASSIGN(st, fresh);
2533 		assert(c != OUT);
2534 		st = step(m->g, startst, stopst, tmp, c, st);
2535 		SP("aft", st, c);
2536 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
2537 		p++;
2538 	}
2539 
2540 	assert(coldp != NULL);
2541 	m->coldp = coldp;
2542 	if (ISSET(st, stopst))
2543 		return(p+1);
2544 	else
2545 		return(NULL);
2546 }
2547 
2548 /*
2549  - slow - step through the string more deliberately
2550  == static char *slow(register struct match *m, char *start, \
2551  ==	char *stop, sopno startst, sopno stopst);
2552  */
2553 static char *			/* where it ended */
slow(m,start,stop,startst,stopst)2554 slow(m, start, stop, startst, stopst)
2555 register struct match *m;
2556 char *start;
2557 char *stop;
2558 sopno startst;
2559 sopno stopst;
2560 {
2561 	register states st = m->st;
2562 	register states empty = m->empty;
2563 	register states tmp = m->tmp;
2564 	register char *p = start;
2565 	register int c = (start == m->beginp) ? OUT : *(start-1);
2566 	register int lastc;	/* previous c */
2567 	register int flagch;
2568 	register int i;
2569 	register char *matchp;	/* last p at which a match ended */
2570 
2571 	AT("slow", start, stop, startst, stopst);
2572 	CLEAR(st);
2573 	SET1(st, startst);
2574 	SP("sstart", st, *p);
2575 	st = step(m->g, startst, stopst, st, NOTHING, st);
2576 	matchp = NULL;
2577 	for (;;) {
2578 		/* next character */
2579 		lastc = c;
2580 		c = (p == m->endp) ? OUT : *p;
2581 
2582 		/* is there an EOL and/or BOL between lastc and c? */
2583 		flagch = '\0';
2584 		i = 0;
2585 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
2586 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
2587 			flagch = BOL;
2588 			i = m->g->nbol;
2589 		}
2590 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
2591 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
2592 			flagch = (flagch == BOL) ? BOLEOL : EOL;
2593 			i += m->g->neol;
2594 		}
2595 		if (i != 0) {
2596 			for (; i > 0; i--)
2597 				st = step(m->g, startst, stopst, st, flagch, st);
2598 			SP("sboleol", st, c);
2599 		}
2600 
2601 		/* how about a word boundary? */
2602 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
2603 					(c != OUT && ISWORD(c)) ) {
2604 			flagch = BOW;
2605 		}
2606 		if ( (lastc != OUT && ISWORD(lastc)) &&
2607 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
2608 			flagch = EOW;
2609 		}
2610 		if (flagch == BOW || flagch == EOW) {
2611 			st = step(m->g, startst, stopst, st, flagch, st);
2612 			SP("sboweow", st, c);
2613 		}
2614 
2615 		/* are we done? */
2616 		if (ISSET(st, stopst))
2617 			matchp = p;
2618 		if (EQ(st, empty) || p == stop)
2619 			break;		/* NOTE BREAK OUT */
2620 
2621 		/* no, we must deal with this character */
2622 		ASSIGN(tmp, st);
2623 		ASSIGN(st, empty);
2624 		assert(c != OUT);
2625 		st = step(m->g, startst, stopst, tmp, c, st);
2626 		SP("saft", st, c);
2627 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
2628 		p++;
2629 	}
2630 
2631 	return(matchp);
2632 }
2633 
2634 
2635 /*
2636  - step - map set of states reachable before char to set reachable after
2637  == static states step(register struct re_guts *g, sopno start, sopno stop, \
2638  ==	register states bef, int ch, register states aft);
2639  == #define	BOL	(OUT+1)
2640  == #define	EOL	(BOL+1)
2641  == #define	BOLEOL	(BOL+2)
2642  == #define	NOTHING	(BOL+3)
2643  == #define	BOW	(BOL+4)
2644  == #define	EOW	(BOL+5)
2645  == #define	CODEMAX	(BOL+5)		// highest code used
2646  == #define	NONCHAR(c)	((c) > CHAR_MAX)
2647  == #define	NNONCHAR	(CODEMAX-CHAR_MAX)
2648  */
2649 static states
step(g,start,stop,bef,ch,aft)2650 step(g, start, stop, bef, ch, aft)
2651 register struct re_guts *g;
2652 sopno start;			/* start state within strip */
2653 sopno stop;			/* state after stop state within strip */
2654 register states bef;		/* states reachable before */
2655 int ch;				/* character or NONCHAR code */
2656 register states aft;		/* states already known reachable after */
2657 {
2658 	register cset *cs;
2659 	register sop s;
2660 	register sopno pc;
2661 	register onestate here;		/* note, macros know this name */
2662 	register sopno look;
2663 	register long i;
2664 
2665 	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
2666 		s = g->strip[pc];
2667 		switch (OP(s)) {
2668 		case OEND:
2669 			assert(pc == stop-1);
2670 			break;
2671 		case OCHAR:
2672 			/* only characters can match */
2673 			assert(!NONCHAR(ch) || ch != (char)OPND(s));
2674 			if (ch == (char)OPND(s))
2675 				FWD(aft, bef, 1);
2676 			break;
2677 		case OBOL:
2678 			if (ch == BOL || ch == BOLEOL)
2679 				FWD(aft, bef, 1);
2680 			break;
2681 		case OEOL:
2682 			if (ch == EOL || ch == BOLEOL)
2683 				FWD(aft, bef, 1);
2684 			break;
2685 		case OBOW:
2686 			if (ch == BOW)
2687 				FWD(aft, bef, 1);
2688 			break;
2689 		case OEOW:
2690 			if (ch == EOW)
2691 				FWD(aft, bef, 1);
2692 			break;
2693 		case OANY:
2694 			if (!NONCHAR(ch))
2695 				FWD(aft, bef, 1);
2696 			break;
2697 		case OANYOF:
2698 			cs = &g->sets[OPND(s)];
2699 			if (!NONCHAR(ch) && CHIN(cs, ch))
2700 				FWD(aft, bef, 1);
2701 			break;
2702 		case OBACK_:		/* ignored here */
2703 		case O_BACK:
2704 			FWD(aft, aft, 1);
2705 			break;
2706 		case OPLUS_:		/* forward, this is just an empty */
2707 			FWD(aft, aft, 1);
2708 			break;
2709 		case O_PLUS:		/* both forward and back */
2710 			FWD(aft, aft, 1);
2711 			i = ISSETBACK(aft, OPND(s));
2712 			BACK(aft, aft, OPND(s));
2713 			if (!i && ISSETBACK(aft, OPND(s))) {
2714 				/* oho, must reconsider loop body */
2715 				pc -= OPND(s) + 1;
2716 				INIT(here, pc);
2717 			}
2718 			break;
2719 		case OQUEST_:		/* two branches, both forward */
2720 			FWD(aft, aft, 1);
2721 			FWD(aft, aft, OPND(s));
2722 			break;
2723 		case O_QUEST:		/* just an empty */
2724 			FWD(aft, aft, 1);
2725 			break;
2726 		case OLPAREN:		/* not significant here */
2727 		case ORPAREN:
2728 			FWD(aft, aft, 1);
2729 			break;
2730 		case OCH_:		/* mark the first two branches */
2731 			FWD(aft, aft, 1);
2732 			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
2733 			FWD(aft, aft, OPND(s));
2734 			break;
2735 		case OOR1:		/* done a branch, find the O_CH */
2736 			if (ISSTATEIN(aft, here)) {
2737 				for (look = 1;
2738 						OP(s = g->strip[pc+look]) != O_CH;
2739 						look += OPND(s))
2740 					assert(OP(s) == OOR2);
2741 				FWD(aft, aft, look);
2742 			}
2743 			break;
2744 		case OOR2:		/* propagate OCH_'s marking */
2745 			FWD(aft, aft, 1);
2746 			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
2747 				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
2748 				FWD(aft, aft, OPND(s));
2749 			}
2750 			break;
2751 		case O_CH:		/* just empty */
2752 			FWD(aft, aft, 1);
2753 			break;
2754 		default:		/* ooooops... */
2755 			assert(nope);
2756 			break;
2757 		}
2758 	}
2759 
2760 	return(aft);
2761 }
2762 
2763 #undef	matcher
2764 #undef	fast
2765 #undef	slow
2766 #undef	dissect
2767 #undef	backref
2768 #undef	step
2769 #undef	print
2770 #undef	at
2771 #undef	match
2772 
2773 /* now undo things */
2774 #undef	states
2775 #undef	CLEAR
2776 #undef	SET0
2777 #undef	SET1
2778 #undef	ISSET
2779 #undef	ASSIGN
2780 #undef	EQ
2781 #undef	STATEVARS
2782 #undef	STATESETUP
2783 #undef	STATETEARDOWN
2784 #undef	SETUP
2785 #undef	onestate
2786 #undef	INIT
2787 #undef	INC
2788 #undef	ISSTATEIN
2789 #undef	FWD
2790 #undef	BACK
2791 #undef	ISSETBACK
2792 #undef	SNAMES
2793 
2794 /* macros for manipulating states, large version */
2795 #define	states	char *
2796 #define	CLEAR(v)	memset(v, 0, m->g->nstates)
2797 #define	SET0(v, n)	((v)[n] = 0)
2798 #define	SET1(v, n)	((v)[n] = 1)
2799 #define	ISSET(v, n)	((v)[n])
2800 #define	ASSIGN(d, s)	memcpy(d, s, m->g->nstates)
2801 #define	EQ(a, b)	(memcmp(a, b, m->g->nstates) == 0)
2802 #define	STATEVARS	int vn; char *space
2803 #define	STATESETUP(m, nv)	{ (m)->space = malloc((nv)*(m)->g->nstates); \
2804 				if ((m)->space == NULL) return(REG_ESPACE); \
2805 				(m)->vn = 0; }
2806 #define	STATETEARDOWN(m)	{ free((m)->space); }
2807 #define	SETUP(v)	((v) = &m->space[m->vn++ * m->g->nstates])
2808 #define	onestate	int
2809 #define	INIT(o, n)	((o) = (n))
2810 #define	INC(o)	((o)++)
2811 #define	ISSTATEIN(v, o)	((v)[o])
2812 /* some abbreviations; note that some of these know variable names! */
2813 /* do "if I'm here, I can also be there" etc without branches */
2814 #define	FWD(dst, src, n)	((dst)[here+(n)] |= (src)[here])
2815 #define	BACK(dst, src, n)	((dst)[here-(n)] |= (src)[here])
2816 #define	ISSETBACK(v, n)	((v)[here - (n)])
2817 /* function names */
2818 #define	LNAMES			/* flag */
2819 
2820 /*
2821  * The matching engine and friends.  This file is #included by regexec.c
2822  * after suitable #defines of a variety of macros used herein, so that
2823  * different state representations can be used without duplicating masses
2824  * of code.
2825  */
2826 
2827 #ifdef SNAMES
2828 #define	matcher	smatcher
2829 #define	fast	sfast
2830 #define	slow	sslow
2831 #define	dissect	sdissect
2832 #define	backref	sbackref
2833 #define	step	sstep
2834 #define	print	sprint
2835 #define	at	sat
2836 #define	match	smat
2837 #endif
2838 #ifdef LNAMES
2839 #define	matcher	lmatcher
2840 #define	fast	lfast
2841 #define	slow	lslow
2842 #define	dissect	ldissect
2843 #define	backref	lbackref
2844 #define	step	lstep
2845 #define	print	lprint
2846 #define	at	lat
2847 #define	match	lmat
2848 #endif
2849 
2850 /* another structure passed up and down to avoid zillions of parameters */
2851 struct match {
2852 	struct re_guts *g;
2853 	int eflags;
2854 	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
2855 	char *offp;		/* offsets work from here */
2856 	char *beginp;		/* start of string -- virtual NUL precedes */
2857 	char *endp;		/* end of string -- virtual NUL here */
2858 	char *coldp;		/* can be no match starting before here */
2859 	char **lastpos;		/* [nplus+1] */
2860 	STATEVARS;
2861 	states st;		/* current states */
2862 	states fresh;		/* states for a fresh start */
2863 	states tmp;		/* temporary */
2864 	states empty;		/* empty set of states */
2865 };
2866 
2867 static int matcher(register struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
2868 static char *dissect(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
2869 static char *backref(register struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
2870 static char *fast(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
2871 static char *slow(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
2872 static states step(register struct re_guts *g, sopno start, sopno stop, register states bef, int ch, register states aft);
2873 #define	BOL	(OUT+1)
2874 #define	EOL	(BOL+1)
2875 #define	BOLEOL	(BOL+2)
2876 #define	NOTHING	(BOL+3)
2877 #define	BOW	(BOL+4)
2878 #define	EOW	(BOL+5)
2879 #define	CODEMAX	(BOL+5)		/* highest code used */
2880 #define	NONCHAR(c)	((c) > CHAR_MAX)
2881 #define	NNONCHAR	(CODEMAX-CHAR_MAX)
2882 #define	SP(t, s, c)	/* nothing */
2883 #define	AT(t, p1, p2, s1, s2)	/* nothing */
2884 #define	NOTE(s)	/* nothing */
2885 
2886 /*
2887  - matcher - the actual matching engine
2888  == static int matcher(register struct re_guts *g, char *string, \
2889  ==	size_t nmatch, regmatch_t pmatch[], int eflags);
2890  */
2891 static int			/* 0 success, REG_NOMATCH failure */
matcher(g,string,nmatch,pmatch,eflags)2892 matcher(g, string, nmatch, pmatch, eflags)
2893 register struct re_guts *g;
2894 char *string;
2895 size_t nmatch;
2896 regmatch_t pmatch[];
2897 int eflags;
2898 {
2899 	register char *endp;
2900 	register int i;
2901 	struct match mv;
2902 	register struct match *m = &mv;
2903 	register char *dp;
2904 	const register sopno gf = g->firststate+1;	/* +1 for OEND */
2905 	const register sopno gl = g->laststate;
2906 	char *start;
2907 	char *stop;
2908 
2909 	/* simplify the situation where possible */
2910 	if (g->cflags&REG_NOSUB)
2911 		nmatch = 0;
2912 	if (eflags&REG_STARTEND) {
2913 		start = string + pmatch[0].rm_so;
2914 		stop = string + pmatch[0].rm_eo;
2915 	} else {
2916 		start = string;
2917 		stop = start + strlen(start);
2918 	}
2919 	if (stop < start)
2920 		return(REG_INVARG);
2921 
2922 	/* prescreening; this does wonders for this rather slow code */
2923 	if (g->must != NULL) {
2924 		for (dp = start; dp < stop; dp++)
2925 			if (*dp == g->must[0] && stop - dp >= g->mlen &&
2926 				memcmp(dp, g->must, (size_t)g->mlen) == 0)
2927 				break;
2928 		if (dp == stop)		/* we didn't find g->must */
2929 			return(REG_NOMATCH);
2930 	}
2931 
2932 	/* match struct setup */
2933 	m->g = g;
2934 	m->eflags = eflags;
2935 	m->pmatch = NULL;
2936 	m->lastpos = NULL;
2937 	m->offp = string;
2938 	m->beginp = start;
2939 	m->endp = stop;
2940 	STATESETUP(m, 4);
2941 	SETUP(m->st);
2942 	SETUP(m->fresh);
2943 	SETUP(m->tmp);
2944 	SETUP(m->empty);
2945 	CLEAR(m->empty);
2946 
2947 	/* this loop does only one repetition except for backrefs */
2948 	for (;;) {
2949 		endp = fast(m, start, stop, gf, gl);
2950 		if (endp == NULL) {		/* a miss */
2951 			STATETEARDOWN(m);
2952 			return(REG_NOMATCH);
2953 		}
2954 		if (nmatch == 0 && !g->backrefs)
2955 			break;		/* no further info needed */
2956 
2957 		/* where? */
2958 		assert(m->coldp != NULL);
2959 		for (;;) {
2960 			NOTE("finding start");
2961 			endp = slow(m, m->coldp, stop, gf, gl);
2962 			if (endp != NULL)
2963 				break;
2964 			assert(m->coldp < m->endp);
2965 			m->coldp++;
2966 		}
2967 		if (nmatch == 1 && !g->backrefs)
2968 			break;		/* no further info needed */
2969 
2970 		/* oh my, he wants the subexpressions... */
2971 		if (m->pmatch == NULL)
2972 			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
2973 							sizeof(regmatch_t));
2974 		if (m->pmatch == NULL) {
2975 			STATETEARDOWN(m);
2976 			return(REG_ESPACE);
2977 		}
2978 		for (i = 1; i <= m->g->nsub; i++)
2979 			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
2980 		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
2981 			NOTE("dissecting");
2982 			dp = dissect(m, m->coldp, endp, gf, gl);
2983 		} else {
2984 			if (g->nplus > 0 && m->lastpos == NULL)
2985 				m->lastpos = (char **)malloc((g->nplus+1) *
2986 							sizeof(char *));
2987 			if (g->nplus > 0 && m->lastpos == NULL) {
2988 				free(m->pmatch);
2989 				STATETEARDOWN(m);
2990 				return(REG_ESPACE);
2991 			}
2992 			NOTE("backref dissect");
2993 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
2994 		}
2995 		if (dp != NULL)
2996 			break;
2997 
2998 		/* uh-oh... we couldn't find a subexpression-level match */
2999 		assert(g->backrefs);	/* must be back references doing it */
3000 		assert(g->nplus == 0 || m->lastpos != NULL);
3001 		for (;;) {
3002 			if (dp != NULL || endp <= m->coldp)
3003 				break;		/* defeat */
3004 			NOTE("backoff");
3005 			endp = slow(m, m->coldp, endp-1, gf, gl);
3006 			if (endp == NULL)
3007 				break;		/* defeat */
3008 			/* try it on a shorter possibility */
3009 			NOTE("backoff dissect");
3010 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
3011 		}
3012 		assert(dp == NULL || dp == endp);
3013 		if (dp != NULL)		/* found a shorter one */
3014 			break;
3015 
3016 		/* despite initial appearances, there is no match here */
3017 		NOTE("false alarm");
3018 		start = m->coldp + 1;	/* recycle starting later */
3019 		assert(start <= stop);
3020 	}
3021 
3022 	/* fill in the details if requested */
3023 	if (nmatch > 0) {
3024 		pmatch[0].rm_so = m->coldp - m->offp;
3025 		pmatch[0].rm_eo = endp - m->offp;
3026 	}
3027 	if (nmatch > 1) {
3028 		assert(m->pmatch != NULL);
3029 		for (i = 1; i < nmatch; i++)
3030 			if (i <= m->g->nsub)
3031 				pmatch[i] = m->pmatch[i];
3032 			else {
3033 				pmatch[i].rm_so = -1;
3034 				pmatch[i].rm_eo = -1;
3035 			}
3036 	}
3037 
3038 	if (m->pmatch != NULL)
3039 		free((char *)m->pmatch);
3040 	if (m->lastpos != NULL)
3041 		free((char *)m->lastpos);
3042 	STATETEARDOWN(m);
3043 	return(0);
3044 }
3045 
3046 /*
3047  - dissect - figure out what matched what, no back references
3048  == static char *dissect(register struct match *m, char *start, \
3049  ==	char *stop, sopno startst, sopno stopst);
3050  */
3051 static char *			/* == stop (success) always */
dissect(m,start,stop,startst,stopst)3052 dissect(m, start, stop, startst, stopst)
3053 register struct match *m;
3054 char *start;
3055 char *stop;
3056 sopno startst;
3057 sopno stopst;
3058 {
3059 	register int i;
3060 	register sopno ss;	/* start sop of current subRE */
3061 	register sopno es;	/* end sop of current subRE */
3062 	register char *sp;	/* start of string matched by it */
3063 	register char *stp;	/* string matched by it cannot pass here */
3064 	register char *rest;	/* start of rest of string */
3065 	register char *tail;	/* string unmatched by rest of RE */
3066 	register sopno ssub;	/* start sop of subsubRE */
3067 	register sopno esub;	/* end sop of subsubRE */
3068 	register char *ssp;	/* start of string matched by subsubRE */
3069 	register char *sep;	/* end of string matched by subsubRE */
3070 	register char *oldssp;	/* previous ssp */
3071 	register char *dp;
3072 
3073 	AT("diss", start, stop, startst, stopst);
3074 	sp = start;
3075 	for (ss = startst; ss < stopst; ss = es) {
3076 		/* identify end of subRE */
3077 		es = ss;
3078 		switch (OP(m->g->strip[es])) {
3079 		case OPLUS_:
3080 		case OQUEST_:
3081 			es += OPND(m->g->strip[es]);
3082 			break;
3083 		case OCH_:
3084 			while (OP(m->g->strip[es]) != O_CH)
3085 				es += OPND(m->g->strip[es]);
3086 			break;
3087 		}
3088 		es++;
3089 
3090 		/* figure out what it matched */
3091 		switch (OP(m->g->strip[ss])) {
3092 		case OEND:
3093 			assert(nope);
3094 			break;
3095 		case OCHAR:
3096 			sp++;
3097 			break;
3098 		case OBOL:
3099 		case OEOL:
3100 		case OBOW:
3101 		case OEOW:
3102 			break;
3103 		case OANY:
3104 		case OANYOF:
3105 			sp++;
3106 			break;
3107 		case OBACK_:
3108 		case O_BACK:
3109 			assert(nope);
3110 			break;
3111 		/* cases where length of match is hard to find */
3112 		case OQUEST_:
3113 			stp = stop;
3114 			for (;;) {
3115 				/* how long could this one be? */
3116 				rest = slow(m, sp, stp, ss, es);
3117 				assert(rest != NULL);	/* it did match */
3118 				/* could the rest match the rest? */
3119 				tail = slow(m, rest, stop, es, stopst);
3120 				if (tail == stop)
3121 					break;		/* yes! */
3122 				/* no -- try a shorter match for this one */
3123 				stp = rest - 1;
3124 				assert(stp >= sp);	/* it did work */
3125 			}
3126 			ssub = ss + 1;
3127 			esub = es - 1;
3128 			/* did innards match? */
3129 			if (slow(m, sp, rest, ssub, esub) != NULL) {
3130 				dp = dissect(m, sp, rest, ssub, esub);
3131 				assert(dp == rest);
3132 			} else		/* no */
3133 				assert(sp == rest);
3134 			sp = rest;
3135 			break;
3136 		case OPLUS_:
3137 			stp = stop;
3138 			for (;;) {
3139 				/* how long could this one be? */
3140 				rest = slow(m, sp, stp, ss, es);
3141 				assert(rest != NULL);	/* it did match */
3142 				/* could the rest match the rest? */
3143 				tail = slow(m, rest, stop, es, stopst);
3144 				if (tail == stop)
3145 					break;		/* yes! */
3146 				/* no -- try a shorter match for this one */
3147 				stp = rest - 1;
3148 				assert(stp >= sp);	/* it did work */
3149 			}
3150 			ssub = ss + 1;
3151 			esub = es - 1;
3152 			ssp = sp;
3153 			oldssp = ssp;
3154 			for (;;) {	/* find last match of innards */
3155 				sep = slow(m, ssp, rest, ssub, esub);
3156 				if (sep == NULL || sep == ssp)
3157 					break;	/* failed or matched null */
3158 				oldssp = ssp;	/* on to next try */
3159 				ssp = sep;
3160 			}
3161 			if (sep == NULL) {
3162 				/* last successful match */
3163 				sep = ssp;
3164 				ssp = oldssp;
3165 			}
3166 			assert(sep == rest);	/* must exhaust substring */
3167 			assert(slow(m, ssp, sep, ssub, esub) == rest);
3168 			dp = dissect(m, ssp, sep, ssub, esub);
3169 			assert(dp == sep);
3170 			sp = rest;
3171 			break;
3172 		case OCH_:
3173 			stp = stop;
3174 			for (;;) {
3175 				/* how long could this one be? */
3176 				rest = slow(m, sp, stp, ss, es);
3177 				assert(rest != NULL);	/* it did match */
3178 				/* could the rest match the rest? */
3179 				tail = slow(m, rest, stop, es, stopst);
3180 				if (tail == stop)
3181 					break;		/* yes! */
3182 				/* no -- try a shorter match for this one */
3183 				stp = rest - 1;
3184 				assert(stp >= sp);	/* it did work */
3185 			}
3186 			ssub = ss + 1;
3187 			esub = ss + OPND(m->g->strip[ss]) - 1;
3188 			assert(OP(m->g->strip[esub]) == OOR1);
3189 			for (;;) {	/* find first matching branch */
3190 				if (slow(m, sp, rest, ssub, esub) == rest)
3191 					break;	/* it matched all of it */
3192 				/* that one missed, try next one */
3193 				assert(OP(m->g->strip[esub]) == OOR1);
3194 				esub++;
3195 				assert(OP(m->g->strip[esub]) == OOR2);
3196 				ssub = esub + 1;
3197 				esub += OPND(m->g->strip[esub]);
3198 				if (OP(m->g->strip[esub]) == OOR2)
3199 					esub--;
3200 				else
3201 					assert(OP(m->g->strip[esub]) == O_CH);
3202 			}
3203 			dp = dissect(m, sp, rest, ssub, esub);
3204 			assert(dp == rest);
3205 			sp = rest;
3206 			break;
3207 		case O_PLUS:
3208 		case O_QUEST:
3209 		case OOR1:
3210 		case OOR2:
3211 		case O_CH:
3212 			assert(nope);
3213 			break;
3214 		case OLPAREN:
3215 			i = OPND(m->g->strip[ss]);
3216 			assert(0 < i && i <= m->g->nsub);
3217 			m->pmatch[i].rm_so = sp - m->offp;
3218 			break;
3219 		case ORPAREN:
3220 			i = OPND(m->g->strip[ss]);
3221 			assert(0 < i && i <= m->g->nsub);
3222 			m->pmatch[i].rm_eo = sp - m->offp;
3223 			break;
3224 		default:		/* uh oh */
3225 			assert(nope);
3226 			break;
3227 		}
3228 	}
3229 
3230 	assert(sp == stop);
3231 	return(sp);
3232 }
3233 
3234 /*
3235  - backref - figure out what matched what, figuring in back references
3236  == static char *backref(register struct match *m, char *start, \
3237  ==	char *stop, sopno startst, sopno stopst, sopno lev);
3238  */
3239 static char *			/* == stop (success) or NULL (failure) */
backref(m,start,stop,startst,stopst,lev)3240 backref(m, start, stop, startst, stopst, lev)
3241 register struct match *m;
3242 char *start;
3243 char *stop;
3244 sopno startst;
3245 sopno stopst;
3246 sopno lev;			/* PLUS nesting level */
3247 {
3248 	register int i;
3249 	register sopno ss;	/* start sop of current subRE */
3250 	register char *sp;	/* start of string matched by it */
3251 	register sopno ssub;	/* start sop of subsubRE */
3252 	register sopno esub;	/* end sop of subsubRE */
3253 	register char *ssp;	/* start of string matched by subsubRE */
3254 	register char *dp;
3255 	register size_t len;
3256 	register int hard;
3257 	register sop s;
3258 	register regoff_t offsave;
3259 	register cset *cs;
3260 
3261 	AT("back", start, stop, startst, stopst);
3262 	sp = start;
3263 
3264 	/* get as far as we can with easy stuff */
3265 	hard = 0;
3266 	for (ss = startst; !hard && ss < stopst; ss++)
3267 		switch (OP(s = m->g->strip[ss])) {
3268 		case OCHAR:
3269 			if (sp == stop || *sp++ != (char)OPND(s))
3270 				return(NULL);
3271 			break;
3272 		case OANY:
3273 			if (sp == stop)
3274 				return(NULL);
3275 			sp++;
3276 			break;
3277 		case OANYOF:
3278 			cs = &m->g->sets[OPND(s)];
3279 			if (sp == stop || !CHIN(cs, *sp++))
3280 				return(NULL);
3281 			break;
3282 		case OBOL:
3283 			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
3284 					(sp < m->endp && *(sp-1) == '\n' &&
3285 						(m->g->cflags&REG_NEWLINE)) )
3286 				{ /* yes */ }
3287 			else
3288 				return(NULL);
3289 			break;
3290 		case OEOL:
3291 			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
3292 					(sp < m->endp && *sp == '\n' &&
3293 						(m->g->cflags&REG_NEWLINE)) )
3294 				{ /* yes */ }
3295 			else
3296 				return(NULL);
3297 			break;
3298 		case OBOW:
3299 			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
3300 					(sp < m->endp && *(sp-1) == '\n' &&
3301 						(m->g->cflags&REG_NEWLINE)) ||
3302 					(sp > m->beginp &&
3303 							!ISWORD(*(sp-1))) ) &&
3304 					(sp < m->endp && ISWORD(*sp)) )
3305 				{ /* yes */ }
3306 			else
3307 				return(NULL);
3308 			break;
3309 		case OEOW:
3310 			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
3311 					(sp < m->endp && *sp == '\n' &&
3312 						(m->g->cflags&REG_NEWLINE)) ||
3313 					(sp < m->endp && !ISWORD(*sp)) ) &&
3314 					(sp > m->beginp && ISWORD(*(sp-1))) )
3315 				{ /* yes */ }
3316 			else
3317 				return(NULL);
3318 			break;
3319 		case O_QUEST:
3320 			break;
3321 		case OOR1:	/* matches null but needs to skip */
3322 			ss++;
3323 			s = m->g->strip[ss];
3324 			do {
3325 				assert(OP(s) == OOR2);
3326 				ss += OPND(s);
3327 			} while (OP(s = m->g->strip[ss]) != O_CH);
3328 			/* note that the ss++ gets us past the O_CH */
3329 			break;
3330 		default:	/* have to make a choice */
3331 			hard = 1;
3332 			break;
3333 		}
3334 	if (!hard) {		/* that was it! */
3335 		if (sp != stop)
3336 			return(NULL);
3337 		return(sp);
3338 	}
3339 	ss--;			/* adjust for the for's final increment */
3340 
3341 	/* the hard stuff */
3342 	AT("hard", sp, stop, ss, stopst);
3343 	s = m->g->strip[ss];
3344 	switch (OP(s)) {
3345 	case OBACK_:		/* the vilest depths */
3346 		i = OPND(s);
3347 		assert(0 < i && i <= m->g->nsub);
3348 		if (m->pmatch[i].rm_eo == -1)
3349 			return(NULL);
3350 		assert(m->pmatch[i].rm_so != -1);
3351 		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
3352 		assert(stop - m->beginp >= len);
3353 		if (sp > stop - len)
3354 			return(NULL);	/* not enough left to match */
3355 		ssp = m->offp + m->pmatch[i].rm_so;
3356 		if (memcmp(sp, ssp, len) != 0)
3357 			return(NULL);
3358 		while (m->g->strip[ss] != SOP(O_BACK, i))
3359 			ss++;
3360 		return(backref(m, sp+len, stop, ss+1, stopst, lev));
3361 		break;
3362 	case OQUEST_:		/* to null or not */
3363 		dp = backref(m, sp, stop, ss+1, stopst, lev);
3364 		if (dp != NULL)
3365 			return(dp);	/* not */
3366 		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
3367 		break;
3368 	case OPLUS_:
3369 		assert(m->lastpos != NULL);
3370 		assert(lev+1 <= m->g->nplus);
3371 		m->lastpos[lev+1] = sp;
3372 		return(backref(m, sp, stop, ss+1, stopst, lev+1));
3373 		break;
3374 	case O_PLUS:
3375 		if (sp == m->lastpos[lev])	/* last pass matched null */
3376 			return(backref(m, sp, stop, ss+1, stopst, lev-1));
3377 		/* try another pass */
3378 		m->lastpos[lev] = sp;
3379 		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
3380 		if (dp == NULL)
3381 			return(backref(m, sp, stop, ss+1, stopst, lev-1));
3382 		else
3383 			return(dp);
3384 		break;
3385 	case OCH_:		/* find the right one, if any */
3386 		ssub = ss + 1;
3387 		esub = ss + OPND(s) - 1;
3388 		assert(OP(m->g->strip[esub]) == OOR1);
3389 		for (;;) {	/* find first matching branch */
3390 			dp = backref(m, sp, stop, ssub, esub, lev);
3391 			if (dp != NULL)
3392 				return(dp);
3393 			/* that one missed, try next one */
3394 			if (OP(m->g->strip[esub]) == O_CH)
3395 				return(NULL);	/* there is none */
3396 			esub++;
3397 			assert(OP(m->g->strip[esub]) == OOR2);
3398 			ssub = esub + 1;
3399 			esub += OPND(m->g->strip[esub]);
3400 			if (OP(m->g->strip[esub]) == OOR2)
3401 				esub--;
3402 			else
3403 				assert(OP(m->g->strip[esub]) == O_CH);
3404 		}
3405 		break;
3406 	case OLPAREN:		/* must undo assignment if rest fails */
3407 		i = OPND(s);
3408 		assert(0 < i && i <= m->g->nsub);
3409 		offsave = m->pmatch[i].rm_so;
3410 		m->pmatch[i].rm_so = sp - m->offp;
3411 		dp = backref(m, sp, stop, ss+1, stopst, lev);
3412 		if (dp != NULL)
3413 			return(dp);
3414 		m->pmatch[i].rm_so = offsave;
3415 		return(NULL);
3416 		break;
3417 	case ORPAREN:		/* must undo assignment if rest fails */
3418 		i = OPND(s);
3419 		assert(0 < i && i <= m->g->nsub);
3420 		offsave = m->pmatch[i].rm_eo;
3421 		m->pmatch[i].rm_eo = sp - m->offp;
3422 		dp = backref(m, sp, stop, ss+1, stopst, lev);
3423 		if (dp != NULL)
3424 			return(dp);
3425 		m->pmatch[i].rm_eo = offsave;
3426 		return(NULL);
3427 		break;
3428 	default:		/* uh oh */
3429 		assert(nope);
3430 		break;
3431 	}
3432 
3433 	/* "can't happen" */
3434 	assert(nope);
3435 	/* NOTREACHED */
3436 	return((char *)NULL);	/* dummy */
3437 }
3438 
3439 /*
3440  - fast - step through the string at top speed
3441  == static char *fast(register struct match *m, char *start, \
3442  ==	char *stop, sopno startst, sopno stopst);
3443  */
3444 static char *			/* where tentative match ended, or NULL */
fast(m,start,stop,startst,stopst)3445 fast(m, start, stop, startst, stopst)
3446 register struct match *m;
3447 char *start;
3448 char *stop;
3449 sopno startst;
3450 sopno stopst;
3451 {
3452 	register states st = m->st;
3453 	register states fresh = m->fresh;
3454 	register states tmp = m->tmp;
3455 	register char *p = start;
3456 	register int c = (start == m->beginp) ? OUT : *(start-1);
3457 	register int lastc;	/* previous c */
3458 	register int flagch;
3459 	register int i;
3460 	register char *coldp;	/* last p after which no match was underway */
3461 
3462 	CLEAR(st);
3463 	SET1(st, startst);
3464 	st = step(m->g, startst, stopst, st, NOTHING, st);
3465 	ASSIGN(fresh, st);
3466 	SP("start", st, *p);
3467 	coldp = NULL;
3468 	for (;;) {
3469 		/* next character */
3470 		lastc = c;
3471 		c = (p == m->endp) ? OUT : *p;
3472 		if (EQ(st, fresh))
3473 			coldp = p;
3474 
3475 		/* is there an EOL and/or BOL between lastc and c? */
3476 		flagch = '\0';
3477 		i = 0;
3478 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
3479 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
3480 			flagch = BOL;
3481 			i = m->g->nbol;
3482 		}
3483 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
3484 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
3485 			flagch = (flagch == BOL) ? BOLEOL : EOL;
3486 			i += m->g->neol;
3487 		}
3488 		if (i != 0) {
3489 			for (; i > 0; i--)
3490 				st = step(m->g, startst, stopst, st, flagch, st);
3491 			SP("boleol", st, c);
3492 		}
3493 
3494 		/* how about a word boundary? */
3495 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
3496 					(c != OUT && ISWORD(c)) ) {
3497 			flagch = BOW;
3498 		}
3499 		if ( (lastc != OUT && ISWORD(lastc)) &&
3500 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
3501 			flagch = EOW;
3502 		}
3503 		if (flagch == BOW || flagch == EOW) {
3504 			st = step(m->g, startst, stopst, st, flagch, st);
3505 			SP("boweow", st, c);
3506 		}
3507 
3508 		/* are we done? */
3509 		if (ISSET(st, stopst) || p == stop)
3510 			break;		/* NOTE BREAK OUT */
3511 
3512 		/* no, we must deal with this character */
3513 		ASSIGN(tmp, st);
3514 		ASSIGN(st, fresh);
3515 		assert(c != OUT);
3516 		st = step(m->g, startst, stopst, tmp, c, st);
3517 		SP("aft", st, c);
3518 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
3519 		p++;
3520 	}
3521 
3522 	assert(coldp != NULL);
3523 	m->coldp = coldp;
3524 	if (ISSET(st, stopst))
3525 		return(p+1);
3526 	else
3527 		return(NULL);
3528 }
3529 
3530 /*
3531  - slow - step through the string more deliberately
3532  == static char *slow(register struct match *m, char *start, \
3533  ==	char *stop, sopno startst, sopno stopst);
3534  */
3535 static char *			/* where it ended */
slow(m,start,stop,startst,stopst)3536 slow(m, start, stop, startst, stopst)
3537 register struct match *m;
3538 char *start;
3539 char *stop;
3540 sopno startst;
3541 sopno stopst;
3542 {
3543 	register states st = m->st;
3544 	register states empty = m->empty;
3545 	register states tmp = m->tmp;
3546 	register char *p = start;
3547 	register int c = (start == m->beginp) ? OUT : *(start-1);
3548 	register int lastc;	/* previous c */
3549 	register int flagch;
3550 	register int i;
3551 	register char *matchp;	/* last p at which a match ended */
3552 
3553 	AT("slow", start, stop, startst, stopst);
3554 	CLEAR(st);
3555 	SET1(st, startst);
3556 	SP("sstart", st, *p);
3557 	st = step(m->g, startst, stopst, st, NOTHING, st);
3558 	matchp = NULL;
3559 	for (;;) {
3560 		/* next character */
3561 		lastc = c;
3562 		c = (p == m->endp) ? OUT : *p;
3563 
3564 		/* is there an EOL and/or BOL between lastc and c? */
3565 		flagch = '\0';
3566 		i = 0;
3567 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
3568 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
3569 			flagch = BOL;
3570 			i = m->g->nbol;
3571 		}
3572 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
3573 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
3574 			flagch = (flagch == BOL) ? BOLEOL : EOL;
3575 			i += m->g->neol;
3576 		}
3577 		if (i != 0) {
3578 			for (; i > 0; i--)
3579 				st = step(m->g, startst, stopst, st, flagch, st);
3580 			SP("sboleol", st, c);
3581 		}
3582 
3583 		/* how about a word boundary? */
3584 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
3585 					(c != OUT && ISWORD(c)) ) {
3586 			flagch = BOW;
3587 		}
3588 		if ( (lastc != OUT && ISWORD(lastc)) &&
3589 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
3590 			flagch = EOW;
3591 		}
3592 		if (flagch == BOW || flagch == EOW) {
3593 			st = step(m->g, startst, stopst, st, flagch, st);
3594 			SP("sboweow", st, c);
3595 		}
3596 
3597 		/* are we done? */
3598 		if (ISSET(st, stopst))
3599 			matchp = p;
3600 		if (EQ(st, empty) || p == stop)
3601 			break;		/* NOTE BREAK OUT */
3602 
3603 		/* no, we must deal with this character */
3604 		ASSIGN(tmp, st);
3605 		ASSIGN(st, empty);
3606 		assert(c != OUT);
3607 		st = step(m->g, startst, stopst, tmp, c, st);
3608 		SP("saft", st, c);
3609 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
3610 		p++;
3611 	}
3612 
3613 	return(matchp);
3614 }
3615 
3616 
3617 static states
step(g,start,stop,bef,ch,aft)3618 step(g, start, stop, bef, ch, aft)
3619 register struct re_guts *g;
3620 sopno start;			/* start state within strip */
3621 sopno stop;			/* state after stop state within strip */
3622 register states bef;		/* states reachable before */
3623 int ch;				/* character or NONCHAR code */
3624 register states aft;		/* states already known reachable after */
3625 {
3626 	register cset *cs;
3627 	register sop s;
3628 	register sopno pc;
3629 	register onestate here;		/* note, macros know this name */
3630 	register sopno look;
3631 	register long i;
3632 
3633 	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
3634 		s = g->strip[pc];
3635 		switch (OP(s)) {
3636 		case OEND:
3637 			assert(pc == stop-1);
3638 			break;
3639 		case OCHAR:
3640 			/* only characters can match */
3641 			assert(!NONCHAR(ch) || ch != (char)OPND(s));
3642 			if (ch == (char)OPND(s))
3643 				FWD(aft, bef, 1);
3644 			break;
3645 		case OBOL:
3646 			if (ch == BOL || ch == BOLEOL)
3647 				FWD(aft, bef, 1);
3648 			break;
3649 		case OEOL:
3650 			if (ch == EOL || ch == BOLEOL)
3651 				FWD(aft, bef, 1);
3652 			break;
3653 		case OBOW:
3654 			if (ch == BOW)
3655 				FWD(aft, bef, 1);
3656 			break;
3657 		case OEOW:
3658 			if (ch == EOW)
3659 				FWD(aft, bef, 1);
3660 			break;
3661 		case OANY:
3662 			if (!NONCHAR(ch))
3663 				FWD(aft, bef, 1);
3664 			break;
3665 		case OANYOF:
3666 			cs = &g->sets[OPND(s)];
3667 			if (!NONCHAR(ch) && CHIN(cs, ch))
3668 				FWD(aft, bef, 1);
3669 			break;
3670 		case OBACK_:		/* ignored here */
3671 		case O_BACK:
3672 			FWD(aft, aft, 1);
3673 			break;
3674 		case OPLUS_:		/* forward, this is just an empty */
3675 			FWD(aft, aft, 1);
3676 			break;
3677 		case O_PLUS:		/* both forward and back */
3678 			FWD(aft, aft, 1);
3679 			i = ISSETBACK(aft, OPND(s));
3680 			BACK(aft, aft, OPND(s));
3681 			if (!i && ISSETBACK(aft, OPND(s))) {
3682 				/* oho, must reconsider loop body */
3683 				pc -= OPND(s) + 1;
3684 				INIT(here, pc);
3685 			}
3686 			break;
3687 		case OQUEST_:		/* two branches, both forward */
3688 			FWD(aft, aft, 1);
3689 			FWD(aft, aft, OPND(s));
3690 			break;
3691 		case O_QUEST:		/* just an empty */
3692 			FWD(aft, aft, 1);
3693 			break;
3694 		case OLPAREN:		/* not significant here */
3695 		case ORPAREN:
3696 			FWD(aft, aft, 1);
3697 			break;
3698 		case OCH_:		/* mark the first two branches */
3699 			FWD(aft, aft, 1);
3700 			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
3701 			FWD(aft, aft, OPND(s));
3702 			break;
3703 		case OOR1:		/* done a branch, find the O_CH */
3704 			if (ISSTATEIN(aft, here)) {
3705 				for (look = 1;
3706 						OP(s = g->strip[pc+look]) != O_CH;
3707 						look += OPND(s))
3708 					assert(OP(s) == OOR2);
3709 				FWD(aft, aft, look);
3710 			}
3711 			break;
3712 		case OOR2:		/* propagate OCH_'s marking */
3713 			FWD(aft, aft, 1);
3714 			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
3715 				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
3716 				FWD(aft, aft, OPND(s));
3717 			}
3718 			break;
3719 		case O_CH:		/* just empty */
3720 			FWD(aft, aft, 1);
3721 			break;
3722 		default:		/* ooooops... */
3723 			assert(nope);
3724 			break;
3725 		}
3726 	}
3727 
3728 	return(aft);
3729 }
3730 
3731 #undef	matcher
3732 #undef	fast
3733 #undef	slow
3734 #undef	dissect
3735 #undef	backref
3736 #undef	step
3737 #undef	print
3738 #undef	at
3739 #undef	match
3740 
3741 int				/* 0 success, REG_NOMATCH failure */
regexec(preg,string,nmatch,pmatch,eflags)3742 regexec(preg, string, nmatch, pmatch, eflags)
3743 const regex_t *preg;
3744 const char *string;
3745 size_t nmatch;
3746 regmatch_t pmatch[];
3747 int eflags;
3748 {
3749 	register struct re_guts *g = preg->re_g;
3750 #define	GOODFLAGS(f)	((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND))
3751 
3752 	if (preg->re_magic != MAGIC1 || g->magic != MAGIC2)
3753 		return(REG_BADPAT);
3754 	assert(!(g->iflags&BAD));
3755 	if (g->iflags&BAD)		/* backstop for no-debug case */
3756 		return(REG_BADPAT);
3757 	eflags = GOODFLAGS(eflags);
3758 
3759 	if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE))
3760 		return(smatcher(g, (char *)string, nmatch, pmatch, eflags));
3761 	else
3762 		return(lmatcher(g, (char *)string, nmatch, pmatch, eflags));
3763 #undef GOODFLAGS
3764 }
3765 /*
3766  - regfree - free everything
3767  = extern void regfree(regex_t *);
3768  */
3769 void
regfree(preg)3770 regfree(preg)
3771 regex_t *preg;
3772 {
3773 	register struct re_guts *g;
3774 
3775 	if (preg->re_magic != MAGIC1)	/* oops */
3776 		return;			/* nice to complain, but hard */
3777 
3778 	g = preg->re_g;
3779 	if (g == NULL || g->magic != MAGIC2)	/* oops again */
3780 		return;
3781 	preg->re_magic = 0;		/* mark it invalid */
3782 	g->magic = 0;			/* mark it invalid */
3783 
3784 	if (g->strip != NULL)
3785 		free((char *)g->strip);
3786 	if (g->sets != NULL)
3787 		free((char *)g->sets);
3788 	if (g->setbits != NULL)
3789 		free((char *)g->setbits);
3790 	if (g->must != NULL)
3791 		free(g->must);
3792 	free((char *)g);
3793 }
3794 
3795 #ifdef WITH_MAIN
main(int argc,char * argv[])3796 int main(int argc, char* argv[]){
3797  regex_t preg;
3798  int i, s;
3799  if(argc<2) return 1;
3800  if (regcomp(&preg, argv[1], REG_NOSUB)) {
3801 	fprintf(stderr, "Failed to compile regex\n");
3802 	return 2;
3803  }
3804  for (i = 2; i<argc; i++){
3805    s = regexec(&preg, argv[i], 0, NULL, 0);
3806    fprintf(stdout, "String[%d]: ", i-1);
3807    switch(s){
3808 	case 0:
3809 		printf("match\n");
3810 		break;
3811 	case REG_NOMATCH:
3812 		printf("not match\n");
3813 		break;
3814 	default:
3815 		printf("failed (%d)\n", s);
3816    }
3817  }
3818  regfree(&preg);
3819  return 0;
3820 }
3821 #endif
3822