1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <setjmp.h>
5 #include <limits.h>
6 
7 #include "regexp.h"
8 #include "utf.h"
9 
10 #define emit regemit
11 #define next regnext
12 #define accept regaccept
13 
14 #define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15 
16 #define REPINF 255
17 #ifndef REG_MAXPROG
18 #define REG_MAXPROG (32 << 10)
19 #endif
20 #ifndef REG_MAXREC
21 #define REG_MAXREC 1024
22 #endif
23 #ifndef REG_MAXSPAN
24 #define REG_MAXSPAN 64
25 #endif
26 #ifndef REG_MAXCLASS
27 #define REG_MAXCLASS 16
28 #endif
29 
30 typedef struct Reclass Reclass;
31 typedef struct Renode Renode;
32 typedef struct Reinst Reinst;
33 typedef struct Rethread Rethread;
34 
35 struct Reclass {
36 	Rune *end;
37 	Rune spans[REG_MAXSPAN];
38 };
39 
40 struct Reprog {
41 	Reinst *start, *end;
42 	int flags;
43 	int nsub;
44 	Reclass cclass[REG_MAXCLASS];
45 };
46 
47 struct cstate {
48 	Reprog *prog;
49 	Renode *pstart, *pend;
50 
51 	const char *source;
52 	int ncclass;
53 	int nsub;
54 	Renode *sub[REG_MAXSUB];
55 
56 	int lookahead;
57 	Rune yychar;
58 	Reclass *yycc;
59 	int yymin, yymax;
60 
61 	const char *error;
62 	jmp_buf kaboom;
63 };
64 
die(struct cstate * g,const char * message)65 static void die(struct cstate *g, const char *message)
66 {
67 	g->error = message;
68 	longjmp(g->kaboom, 1);
69 }
70 
canon(Rune c)71 static int canon(Rune c)
72 {
73 	Rune u = toupperrune(c);
74 	if (c >= 128 && u < 128)
75 		return c;
76 	return u;
77 }
78 
79 /* Scan */
80 
81 enum {
82 	L_CHAR = 256,
83 	L_CCLASS,	/* character class */
84 	L_NCCLASS,	/* negative character class */
85 	L_NC,		/* "(?:" no capture */
86 	L_PLA,		/* "(?=" positive lookahead */
87 	L_NLA,		/* "(?!" negative lookahead */
88 	L_WORD,		/* "\b" word boundary */
89 	L_NWORD,	/* "\B" non-word boundary */
90 	L_REF,		/* "\1" back-reference */
91 	L_COUNT,	/* {M,N} */
92 };
93 
hex(struct cstate * g,int c)94 static int hex(struct cstate *g, int c)
95 {
96 	if (c >= '0' && c <= '9') return c - '0';
97 	if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
98 	if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
99 	die(g, "invalid escape sequence");
100 	return 0;
101 }
102 
dec(struct cstate * g,int c)103 static int dec(struct cstate *g, int c)
104 {
105 	if (c >= '0' && c <= '9') return c - '0';
106 	die(g, "invalid quantifier");
107 	return 0;
108 }
109 
110 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
111 
isunicodeletter(int c)112 static int isunicodeletter(int c)
113 {
114 	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
115 }
116 
nextrune(struct cstate * g)117 static int nextrune(struct cstate *g)
118 {
119 	if (!*g->source) {
120 		g->yychar = EOF;
121 		return 0;
122 	}
123 	g->source += chartorune(&g->yychar, g->source);
124 	if (g->yychar == '\\') {
125 		if (!*g->source)
126 			die(g, "unterminated escape sequence");
127 		g->source += chartorune(&g->yychar, g->source);
128 		switch (g->yychar) {
129 		case 'f': g->yychar = '\f'; return 0;
130 		case 'n': g->yychar = '\n'; return 0;
131 		case 'r': g->yychar = '\r'; return 0;
132 		case 't': g->yychar = '\t'; return 0;
133 		case 'v': g->yychar = '\v'; return 0;
134 		case 'c':
135 			if (!g->source[0])
136 				die(g, "unterminated escape sequence");
137 			g->yychar = (*g->source++) & 31;
138 			return 0;
139 		case 'x':
140 			if (!g->source[0] || !g->source[1])
141 				die(g, "unterminated escape sequence");
142 			g->yychar = hex(g, *g->source++) << 4;
143 			g->yychar += hex(g, *g->source++);
144 			if (g->yychar == 0) {
145 				g->yychar = '0';
146 				return 1;
147 			}
148 			return 0;
149 		case 'u':
150 			if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3])
151 				die(g, "unterminated escape sequence");
152 			g->yychar = hex(g, *g->source++) << 12;
153 			g->yychar += hex(g, *g->source++) << 8;
154 			g->yychar += hex(g, *g->source++) << 4;
155 			g->yychar += hex(g, *g->source++);
156 			if (g->yychar == 0) {
157 				g->yychar = '0';
158 				return 1;
159 			}
160 			return 0;
161 		case 0:
162 			g->yychar = '0';
163 			return 1;
164 		}
165 		if (strchr(ESCAPES, g->yychar))
166 			return 1;
167 		if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
168 			die(g, "invalid escape character");
169 		return 0;
170 	}
171 	return 0;
172 }
173 
lexcount(struct cstate * g)174 static int lexcount(struct cstate *g)
175 {
176 	g->yychar = *g->source++;
177 
178 	g->yymin = dec(g, g->yychar);
179 	g->yychar = *g->source++;
180 	while (g->yychar != ',' && g->yychar != '}') {
181 		g->yymin = g->yymin * 10 + dec(g, g->yychar);
182 		g->yychar = *g->source++;
183 		if (g->yymin >= REPINF)
184 			die(g, "numeric overflow");
185 	}
186 
187 	if (g->yychar == ',') {
188 		g->yychar = *g->source++;
189 		if (g->yychar == '}') {
190 			g->yymax = REPINF;
191 		} else {
192 			g->yymax = dec(g, g->yychar);
193 			g->yychar = *g->source++;
194 			while (g->yychar != '}') {
195 				g->yymax = g->yymax * 10 + dec(g, g->yychar);
196 				g->yychar = *g->source++;
197 				if (g->yymax >= REPINF)
198 					die(g, "numeric overflow");
199 			}
200 		}
201 	} else {
202 		g->yymax = g->yymin;
203 	}
204 
205 	return L_COUNT;
206 }
207 
newcclass(struct cstate * g)208 static void newcclass(struct cstate *g)
209 {
210 	if (g->ncclass >= nelem(g->prog->cclass))
211 		die(g, "too many character classes");
212 	g->yycc = g->prog->cclass + g->ncclass++;
213 	g->yycc->end = g->yycc->spans;
214 }
215 
addrange(struct cstate * g,Rune a,Rune b)216 static void addrange(struct cstate *g, Rune a, Rune b)
217 {
218 	if (a > b)
219 		die(g, "invalid character class range");
220 	if (g->yycc->end + 2 >= g->yycc->spans + nelem(g->yycc->spans))
221 		die(g, "too many character class ranges");
222 	*g->yycc->end++ = a;
223 	*g->yycc->end++ = b;
224 }
225 
addranges_d(struct cstate * g)226 static void addranges_d(struct cstate *g)
227 {
228 	addrange(g, '0', '9');
229 }
230 
addranges_D(struct cstate * g)231 static void addranges_D(struct cstate *g)
232 {
233 	addrange(g, 0, '0'-1);
234 	addrange(g, '9'+1, 0xFFFF);
235 }
236 
addranges_s(struct cstate * g)237 static void addranges_s(struct cstate *g)
238 {
239 	addrange(g, 0x9, 0xD);
240 	addrange(g, 0x20, 0x20);
241 	addrange(g, 0xA0, 0xA0);
242 	addrange(g, 0x2028, 0x2029);
243 	addrange(g, 0xFEFF, 0xFEFF);
244 }
245 
addranges_S(struct cstate * g)246 static void addranges_S(struct cstate *g)
247 {
248 	addrange(g, 0, 0x9-1);
249 	addrange(g, 0xD+1, 0x20-1);
250 	addrange(g, 0x20+1, 0xA0-1);
251 	addrange(g, 0xA0+1, 0x2028-1);
252 	addrange(g, 0x2029+1, 0xFEFF-1);
253 	addrange(g, 0xFEFF+1, 0xFFFF);
254 }
255 
addranges_w(struct cstate * g)256 static void addranges_w(struct cstate *g)
257 {
258 	addrange(g, '0', '9');
259 	addrange(g, 'A', 'Z');
260 	addrange(g, '_', '_');
261 	addrange(g, 'a', 'z');
262 }
263 
addranges_W(struct cstate * g)264 static void addranges_W(struct cstate *g)
265 {
266 	addrange(g, 0, '0'-1);
267 	addrange(g, '9'+1, 'A'-1);
268 	addrange(g, 'Z'+1, '_'-1);
269 	addrange(g, '_'+1, 'a'-1);
270 	addrange(g, 'z'+1, 0xFFFF);
271 }
272 
lexclass(struct cstate * g)273 static int lexclass(struct cstate *g)
274 {
275 	int type = L_CCLASS;
276 	int quoted, havesave, havedash;
277 	Rune save = 0;
278 
279 	newcclass(g);
280 
281 	quoted = nextrune(g);
282 	if (!quoted && g->yychar == '^') {
283 		type = L_NCCLASS;
284 		quoted = nextrune(g);
285 	}
286 
287 	havesave = havedash = 0;
288 	for (;;) {
289 		if (g->yychar == EOF)
290 			die(g, "unterminated character class");
291 		if (!quoted && g->yychar == ']')
292 			break;
293 
294 		if (!quoted && g->yychar == '-') {
295 			if (havesave) {
296 				if (havedash) {
297 					addrange(g, save, '-');
298 					havesave = havedash = 0;
299 				} else {
300 					havedash = 1;
301 				}
302 			} else {
303 				save = '-';
304 				havesave = 1;
305 			}
306 		} else if (quoted && strchr("DSWdsw", g->yychar)) {
307 			if (havesave) {
308 				addrange(g, save, save);
309 				if (havedash)
310 					addrange(g, '-', '-');
311 			}
312 			switch (g->yychar) {
313 			case 'd': addranges_d(g); break;
314 			case 's': addranges_s(g); break;
315 			case 'w': addranges_w(g); break;
316 			case 'D': addranges_D(g); break;
317 			case 'S': addranges_S(g); break;
318 			case 'W': addranges_W(g); break;
319 			}
320 			havesave = havedash = 0;
321 		} else {
322 			if (quoted) {
323 				if (g->yychar == 'b')
324 					g->yychar = '\b';
325 				else if (g->yychar == '0')
326 					g->yychar = 0;
327 				/* else identity escape */
328 			}
329 			if (havesave) {
330 				if (havedash) {
331 					addrange(g, save, g->yychar);
332 					havesave = havedash = 0;
333 				} else {
334 					addrange(g, save, save);
335 					save = g->yychar;
336 				}
337 			} else {
338 				save = g->yychar;
339 				havesave = 1;
340 			}
341 		}
342 
343 		quoted = nextrune(g);
344 	}
345 
346 	if (havesave) {
347 		addrange(g, save, save);
348 		if (havedash)
349 			addrange(g, '-', '-');
350 	}
351 
352 	return type;
353 }
354 
lex(struct cstate * g)355 static int lex(struct cstate *g)
356 {
357 	int quoted = nextrune(g);
358 	if (quoted) {
359 		switch (g->yychar) {
360 		case 'b': return L_WORD;
361 		case 'B': return L_NWORD;
362 		case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
363 		case 's': newcclass(g); addranges_s(g); return L_CCLASS;
364 		case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
365 		case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
366 		case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
367 		case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
368 		case '0': g->yychar = 0; return L_CHAR;
369 		}
370 		if (g->yychar >= '0' && g->yychar <= '9') {
371 			g->yychar -= '0';
372 			if (*g->source >= '0' && *g->source <= '9')
373 				g->yychar = g->yychar * 10 + *g->source++ - '0';
374 			return L_REF;
375 		}
376 		return L_CHAR;
377 	}
378 
379 	switch (g->yychar) {
380 	case EOF:
381 	case '$': case ')': case '*': case '+':
382 	case '.': case '?': case '^': case '|':
383 		return g->yychar;
384 	}
385 
386 	if (g->yychar == '{')
387 		return lexcount(g);
388 	if (g->yychar == '[')
389 		return lexclass(g);
390 	if (g->yychar == '(') {
391 		if (g->source[0] == '?') {
392 			if (g->source[1] == ':') {
393 				g->source += 2;
394 				return L_NC;
395 			}
396 			if (g->source[1] == '=') {
397 				g->source += 2;
398 				return L_PLA;
399 			}
400 			if (g->source[1] == '!') {
401 				g->source += 2;
402 				return L_NLA;
403 			}
404 		}
405 		return '(';
406 	}
407 
408 	return L_CHAR;
409 }
410 
411 /* Parse */
412 
413 enum {
414 	P_CAT, P_ALT, P_REP,
415 	P_BOL, P_EOL, P_WORD, P_NWORD,
416 	P_PAR, P_PLA, P_NLA,
417 	P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
418 	P_REF,
419 };
420 
421 struct Renode {
422 	unsigned char type;
423 	unsigned char ng, m, n;
424 	Rune c;
425 	Reclass *cc;
426 	Renode *x;
427 	Renode *y;
428 };
429 
newnode(struct cstate * g,int type)430 static Renode *newnode(struct cstate *g, int type)
431 {
432 	Renode *node = g->pend++;
433 	node->type = type;
434 	node->cc = NULL;
435 	node->c = 0;
436 	node->ng = 0;
437 	node->m = 0;
438 	node->n = 0;
439 	node->x = node->y = NULL;
440 	return node;
441 }
442 
empty(Renode * node)443 static int empty(Renode *node)
444 {
445 	if (!node) return 1;
446 	switch (node->type) {
447 	default: return 1;
448 	case P_CAT: return empty(node->x) && empty(node->y);
449 	case P_ALT: return empty(node->x) || empty(node->y);
450 	case P_REP: return empty(node->x) || node->m == 0;
451 	case P_PAR: return empty(node->x);
452 	case P_REF: return empty(node->x);
453 	case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
454 	}
455 }
456 
newrep(struct cstate * g,Renode * atom,int ng,int min,int max)457 static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
458 {
459 	Renode *rep = newnode(g, P_REP);
460 	if (max == REPINF && empty(atom))
461 		die(g, "infinite loop matching the empty string");
462 	rep->ng = ng;
463 	rep->m = min;
464 	rep->n = max;
465 	rep->x = atom;
466 	return rep;
467 }
468 
next(struct cstate * g)469 static void next(struct cstate *g)
470 {
471 	g->lookahead = lex(g);
472 }
473 
accept(struct cstate * g,int t)474 static int accept(struct cstate *g, int t)
475 {
476 	if (g->lookahead == t) {
477 		next(g);
478 		return 1;
479 	}
480 	return 0;
481 }
482 
483 static Renode *parsealt(struct cstate *g);
484 
parseatom(struct cstate * g)485 static Renode *parseatom(struct cstate *g)
486 {
487 	Renode *atom;
488 	if (g->lookahead == L_CHAR) {
489 		atom = newnode(g, P_CHAR);
490 		atom->c = g->yychar;
491 		next(g);
492 		return atom;
493 	}
494 	if (g->lookahead == L_CCLASS) {
495 		atom = newnode(g, P_CCLASS);
496 		atom->cc = g->yycc;
497 		next(g);
498 		return atom;
499 	}
500 	if (g->lookahead == L_NCCLASS) {
501 		atom = newnode(g, P_NCCLASS);
502 		atom->cc = g->yycc;
503 		next(g);
504 		return atom;
505 	}
506 	if (g->lookahead == L_REF) {
507 		atom = newnode(g, P_REF);
508 		if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
509 			die(g, "invalid back-reference");
510 		atom->n = g->yychar;
511 		atom->x = g->sub[g->yychar];
512 		next(g);
513 		return atom;
514 	}
515 	if (accept(g, '.'))
516 		return newnode(g, P_ANY);
517 	if (accept(g, '(')) {
518 		atom = newnode(g, P_PAR);
519 		if (g->nsub == REG_MAXSUB)
520 			die(g, "too many captures");
521 		atom->n = g->nsub++;
522 		atom->x = parsealt(g);
523 		g->sub[atom->n] = atom;
524 		if (!accept(g, ')'))
525 			die(g, "unmatched '('");
526 		return atom;
527 	}
528 	if (accept(g, L_NC)) {
529 		atom = parsealt(g);
530 		if (!accept(g, ')'))
531 			die(g, "unmatched '('");
532 		return atom;
533 	}
534 	if (accept(g, L_PLA)) {
535 		atom = newnode(g, P_PLA);
536 		atom->x = parsealt(g);
537 		if (!accept(g, ')'))
538 			die(g, "unmatched '('");
539 		return atom;
540 	}
541 	if (accept(g, L_NLA)) {
542 		atom = newnode(g, P_NLA);
543 		atom->x = parsealt(g);
544 		if (!accept(g, ')'))
545 			die(g, "unmatched '('");
546 		return atom;
547 	}
548 	die(g, "syntax error");
549 	return NULL;
550 }
551 
parserep(struct cstate * g)552 static Renode *parserep(struct cstate *g)
553 {
554 	Renode *atom;
555 
556 	if (accept(g, '^')) return newnode(g, P_BOL);
557 	if (accept(g, '$')) return newnode(g, P_EOL);
558 	if (accept(g, L_WORD)) return newnode(g, P_WORD);
559 	if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
560 
561 	atom = parseatom(g);
562 	if (g->lookahead == L_COUNT) {
563 		int min = g->yymin, max = g->yymax;
564 		next(g);
565 		if (max < min)
566 			die(g, "invalid quantifier");
567 		return newrep(g, atom, accept(g, '?'), min, max);
568 	}
569 	if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
570 	if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
571 	if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
572 	return atom;
573 }
574 
parsecat(struct cstate * g)575 static Renode *parsecat(struct cstate *g)
576 {
577 	Renode *cat, *head, **tail;
578 	if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
579 		/* Build a right-leaning tree by splicing in new 'cat' at the tail. */
580 		head = parserep(g);
581 		tail = &head;
582 		while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
583 			cat = newnode(g, P_CAT);
584 			cat->x = *tail;
585 			cat->y = parserep(g);
586 			*tail = cat;
587 			tail = &cat->y;
588 		}
589 		return head;
590 	}
591 	return NULL;
592 }
593 
parsealt(struct cstate * g)594 static Renode *parsealt(struct cstate *g)
595 {
596 	Renode *alt, *x;
597 	alt = parsecat(g);
598 	while (accept(g, '|')) {
599 		x = alt;
600 		alt = newnode(g, P_ALT);
601 		alt->x = x;
602 		alt->y = parsecat(g);
603 	}
604 	return alt;
605 }
606 
607 /* Compile */
608 
609 enum {
610 	I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
611 	I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
612 	I_BOL, I_EOL, I_WORD, I_NWORD,
613 	I_LPAR, I_RPAR
614 };
615 
616 struct Reinst {
617 	unsigned char opcode;
618 	unsigned char n;
619 	Rune c;
620 	Reclass *cc;
621 	Reinst *x;
622 	Reinst *y;
623 };
624 
count(struct cstate * g,Renode * node)625 static int count(struct cstate *g, Renode *node)
626 {
627 	int min, max, n;
628 	if (!node) return 0;
629 	switch (node->type) {
630 	default: return 1;
631 	case P_CAT: return count(g, node->x) + count(g, node->y);
632 	case P_ALT: return count(g, node->x) + count(g, node->y) + 2;
633 	case P_REP:
634 		min = node->m;
635 		max = node->n;
636 		if (min == max) n = count(g, node->x) * min;
637 		else if (max < REPINF) n = count(g, node->x) * max + (max - min);
638 		else n = count(g, node->x) * (min + 1) + 2;
639 		if (n < 0 || n > REG_MAXPROG) die(g, "program too large");
640 		return n;
641 	case P_PAR: return count(g, node->x) + 2;
642 	case P_PLA: return count(g, node->x) + 2;
643 	case P_NLA: return count(g, node->x) + 2;
644 	}
645 }
646 
emit(Reprog * prog,int opcode)647 static Reinst *emit(Reprog *prog, int opcode)
648 {
649 	Reinst *inst = prog->end++;
650 	inst->opcode = opcode;
651 	inst->n = 0;
652 	inst->c = 0;
653 	inst->cc = NULL;
654 	inst->x = inst->y = NULL;
655 	return inst;
656 }
657 
compile(Reprog * prog,Renode * node)658 static void compile(Reprog *prog, Renode *node)
659 {
660 	Reinst *inst, *split, *jump;
661 	int i;
662 
663 loop:
664 	if (!node)
665 		return;
666 
667 	switch (node->type) {
668 	case P_CAT:
669 		compile(prog, node->x);
670 		node = node->y;
671 		goto loop;
672 
673 	case P_ALT:
674 		split = emit(prog, I_SPLIT);
675 		compile(prog, node->x);
676 		jump = emit(prog, I_JUMP);
677 		compile(prog, node->y);
678 		split->x = split + 1;
679 		split->y = jump + 1;
680 		jump->x = prog->end;
681 		break;
682 
683 	case P_REP:
684 		inst = NULL; /* silence compiler warning. assert(node->m > 0). */
685 		for (i = 0; i < node->m; ++i) {
686 			inst = prog->end;
687 			compile(prog, node->x);
688 		}
689 		if (node->m == node->n)
690 			break;
691 		if (node->n < REPINF) {
692 			for (i = node->m; i < node->n; ++i) {
693 				split = emit(prog, I_SPLIT);
694 				compile(prog, node->x);
695 				if (node->ng) {
696 					split->y = split + 1;
697 					split->x = prog->end;
698 				} else {
699 					split->x = split + 1;
700 					split->y = prog->end;
701 				}
702 			}
703 		} else if (node->m == 0) {
704 			split = emit(prog, I_SPLIT);
705 			compile(prog, node->x);
706 			jump = emit(prog, I_JUMP);
707 			if (node->ng) {
708 				split->y = split + 1;
709 				split->x = prog->end;
710 			} else {
711 				split->x = split + 1;
712 				split->y = prog->end;
713 			}
714 			jump->x = split;
715 		} else {
716 			split = emit(prog, I_SPLIT);
717 			if (node->ng) {
718 				split->y = inst;
719 				split->x = prog->end;
720 			} else {
721 				split->x = inst;
722 				split->y = prog->end;
723 			}
724 		}
725 		break;
726 
727 	case P_BOL: emit(prog, I_BOL); break;
728 	case P_EOL: emit(prog, I_EOL); break;
729 	case P_WORD: emit(prog, I_WORD); break;
730 	case P_NWORD: emit(prog, I_NWORD); break;
731 
732 	case P_PAR:
733 		inst = emit(prog, I_LPAR);
734 		inst->n = node->n;
735 		compile(prog, node->x);
736 		inst = emit(prog, I_RPAR);
737 		inst->n = node->n;
738 		break;
739 	case P_PLA:
740 		split = emit(prog, I_PLA);
741 		compile(prog, node->x);
742 		emit(prog, I_END);
743 		split->x = split + 1;
744 		split->y = prog->end;
745 		break;
746 	case P_NLA:
747 		split = emit(prog, I_NLA);
748 		compile(prog, node->x);
749 		emit(prog, I_END);
750 		split->x = split + 1;
751 		split->y = prog->end;
752 		break;
753 
754 	case P_ANY:
755 		emit(prog, I_ANY);
756 		break;
757 	case P_CHAR:
758 		inst = emit(prog, I_CHAR);
759 		inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
760 		break;
761 	case P_CCLASS:
762 		inst = emit(prog, I_CCLASS);
763 		inst->cc = node->cc;
764 		break;
765 	case P_NCCLASS:
766 		inst = emit(prog, I_NCCLASS);
767 		inst->cc = node->cc;
768 		break;
769 	case P_REF:
770 		inst = emit(prog, I_REF);
771 		inst->n = node->n;
772 		break;
773 	}
774 }
775 
776 #ifdef TEST
dumpnode(Renode * node)777 static void dumpnode(Renode *node)
778 {
779 	Rune *p;
780 	if (!node) { printf("Empty"); return; }
781 	switch (node->type) {
782 	case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
783 	case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
784 	case P_REP:
785 		printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
786 		dumpnode(node->x);
787 		printf(")");
788 		break;
789 	case P_BOL: printf("Bol"); break;
790 	case P_EOL: printf("Eol"); break;
791 	case P_WORD: printf("Word"); break;
792 	case P_NWORD: printf("NotWord"); break;
793 	case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
794 	case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
795 	case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
796 	case P_ANY: printf("Any"); break;
797 	case P_CHAR: printf("Char(%c)", node->c); break;
798 	case P_CCLASS:
799 		printf("Class(");
800 		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
801 		printf(")");
802 		break;
803 	case P_NCCLASS:
804 		printf("NotClass(");
805 		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
806 		printf(")");
807 		break;
808 	case P_REF: printf("Ref(%d)", node->n); break;
809 	}
810 }
811 
dumpprog(Reprog * prog)812 static void dumpprog(Reprog *prog)
813 {
814 	Reinst *inst;
815 	int i;
816 	for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
817 		printf("% 5d: ", i);
818 		switch (inst->opcode) {
819 		case I_END: puts("end"); break;
820 		case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
821 		case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
822 		case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
823 		case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
824 		case I_ANY: puts("any"); break;
825 		case I_ANYNL: puts("anynl"); break;
826 		case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
827 		case I_CCLASS: puts("cclass"); break;
828 		case I_NCCLASS: puts("ncclass"); break;
829 		case I_REF: printf("ref %d\n", inst->n); break;
830 		case I_BOL: puts("bol"); break;
831 		case I_EOL: puts("eol"); break;
832 		case I_WORD: puts("word"); break;
833 		case I_NWORD: puts("nword"); break;
834 		case I_LPAR: printf("lpar %d\n", inst->n); break;
835 		case I_RPAR: printf("rpar %d\n", inst->n); break;
836 		}
837 	}
838 }
839 #endif
840 
regcompx(void * (* alloc)(void * ctx,void * p,int n),void * ctx,const char * pattern,int cflags,const char ** errorp)841 Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
842 	const char *pattern, int cflags, const char **errorp)
843 {
844 	struct cstate g;
845 	Renode *node;
846 	Reinst *split, *jump;
847 	int i, n;
848 
849 	g.pstart = NULL;
850 	g.prog = NULL;
851 
852 	if (setjmp(g.kaboom)) {
853 		if (errorp) *errorp = g.error;
854 		alloc(ctx, g.pstart, 0);
855 		alloc(ctx, g.prog, 0);
856 		return NULL;
857 	}
858 
859 	g.prog = alloc(ctx, NULL, sizeof (Reprog));
860 	if (!g.prog)
861 		die(&g, "cannot allocate regular expression");
862 	n = strlen(pattern) * 2;
863 	if (n > REG_MAXPROG)
864 		die(&g, "program too large");
865 	if (n > 0) {
866 		g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
867 		if (!g.pstart)
868 			die(&g, "cannot allocate regular expression parse list");
869 	}
870 
871 	g.source = pattern;
872 	g.ncclass = 0;
873 	g.nsub = 1;
874 	for (i = 0; i < REG_MAXSUB; ++i)
875 		g.sub[i] = 0;
876 
877 	g.prog->flags = cflags;
878 
879 	next(&g);
880 	node = parsealt(&g);
881 	if (g.lookahead == ')')
882 		die(&g, "unmatched ')'");
883 	if (g.lookahead != EOF)
884 		die(&g, "syntax error");
885 
886 #ifdef TEST
887 	dumpnode(node);
888 	putchar('\n');
889 #endif
890 
891 	n = 6 + count(&g, node);
892 	if (n < 0 || n > REG_MAXPROG)
893 		die(&g, "program too large");
894 
895 	g.prog->nsub = g.nsub;
896 	g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
897 	if (!g.prog->start)
898 		die(&g, "cannot allocate regular expression instruction list");
899 
900 	split = emit(g.prog, I_SPLIT);
901 	split->x = split + 3;
902 	split->y = split + 1;
903 	emit(g.prog, I_ANYNL);
904 	jump = emit(g.prog, I_JUMP);
905 	jump->x = split;
906 	emit(g.prog, I_LPAR);
907 	compile(g.prog, node);
908 	emit(g.prog, I_RPAR);
909 	emit(g.prog, I_END);
910 
911 #ifdef TEST
912 	dumpprog(g.prog);
913 #endif
914 
915 	alloc(ctx, g.pstart, 0);
916 
917 	if (errorp) *errorp = NULL;
918 	return g.prog;
919 }
920 
regfreex(void * (* alloc)(void * ctx,void * p,int n),void * ctx,Reprog * prog)921 void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
922 {
923 	if (prog) {
924 		alloc(ctx, prog->start, 0);
925 		alloc(ctx, prog, 0);
926 	}
927 }
928 
default_alloc(void * ctx,void * p,int n)929 static void *default_alloc(void *ctx, void *p, int n)
930 {
931 	return realloc(p, (size_t)n);
932 }
933 
regcomp(const char * pattern,int cflags,const char ** errorp)934 Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
935 {
936 	return regcompx(default_alloc, NULL, pattern, cflags, errorp);
937 }
938 
regfree(Reprog * prog)939 void regfree(Reprog *prog)
940 {
941 	regfreex(default_alloc, NULL, prog);
942 }
943 
944 /* Match */
945 
isnewline(int c)946 static int isnewline(int c)
947 {
948 	return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
949 }
950 
iswordchar(int c)951 static int iswordchar(int c)
952 {
953 	return c == '_' ||
954 		(c >= 'a' && c <= 'z') ||
955 		(c >= 'A' && c <= 'Z') ||
956 		(c >= '0' && c <= '9');
957 }
958 
incclass(Reclass * cc,Rune c)959 static int incclass(Reclass *cc, Rune c)
960 {
961 	Rune *p;
962 	for (p = cc->spans; p < cc->end; p += 2)
963 		if (p[0] <= c && c <= p[1])
964 			return 1;
965 	return 0;
966 }
967 
incclasscanon(Reclass * cc,Rune c)968 static int incclasscanon(Reclass *cc, Rune c)
969 {
970 	Rune *p, r;
971 	for (p = cc->spans; p < cc->end; p += 2)
972 		for (r = p[0]; r <= p[1]; ++r)
973 			if (c == canon(r))
974 				return 1;
975 	return 0;
976 }
977 
strncmpcanon(const char * a,const char * b,int n)978 static int strncmpcanon(const char *a, const char *b, int n)
979 {
980 	Rune ra, rb;
981 	int c;
982 	while (n--) {
983 		if (!*a) return -1;
984 		if (!*b) return 1;
985 		a += chartorune(&ra, a);
986 		b += chartorune(&rb, b);
987 		c = canon(ra) - canon(rb);
988 		if (c)
989 			return c;
990 	}
991 	return 0;
992 }
993 
match(Reinst * pc,const char * sp,const char * bol,int flags,Resub * out,int depth)994 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth)
995 {
996 	Resub scratch;
997 	int result;
998 	int i;
999 	Rune c;
1000 
1001 	/* stack overflow */
1002 	if (depth > REG_MAXREC)
1003 		return -1;
1004 
1005 	for (;;) {
1006 		switch (pc->opcode) {
1007 		case I_END:
1008 			return 0;
1009 		case I_JUMP:
1010 			pc = pc->x;
1011 			break;
1012 		case I_SPLIT:
1013 			scratch = *out;
1014 			result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1015 			if (result == -1)
1016 				return -1;
1017 			if (result == 0) {
1018 				*out = scratch;
1019 				return 0;
1020 			}
1021 			pc = pc->y;
1022 			break;
1023 
1024 		case I_PLA:
1025 			result = match(pc->x, sp, bol, flags, out, depth+1);
1026 			if (result == -1)
1027 				return -1;
1028 			if (result == 1)
1029 				return 1;
1030 			pc = pc->y;
1031 			break;
1032 		case I_NLA:
1033 			scratch = *out;
1034 			result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1035 			if (result == -1)
1036 				return -1;
1037 			if (result == 0)
1038 				return 1;
1039 			pc = pc->y;
1040 			break;
1041 
1042 		case I_ANYNL:
1043 			if (!*sp) return 1;
1044 			sp += chartorune(&c, sp);
1045 			pc = pc + 1;
1046 			break;
1047 		case I_ANY:
1048 			if (!*sp) return 1;
1049 			sp += chartorune(&c, sp);
1050 			if (isnewline(c))
1051 				return 1;
1052 			pc = pc + 1;
1053 			break;
1054 		case I_CHAR:
1055 			if (!*sp) return 1;
1056 			sp += chartorune(&c, sp);
1057 			if (flags & REG_ICASE)
1058 				c = canon(c);
1059 			if (c != pc->c)
1060 				return 1;
1061 			pc = pc + 1;
1062 			break;
1063 		case I_CCLASS:
1064 			if (!*sp) return 1;
1065 			sp += chartorune(&c, sp);
1066 			if (flags & REG_ICASE) {
1067 				if (!incclasscanon(pc->cc, canon(c)))
1068 					return 1;
1069 			} else {
1070 				if (!incclass(pc->cc, c))
1071 					return 1;
1072 			}
1073 			pc = pc + 1;
1074 			break;
1075 		case I_NCCLASS:
1076 			if (!*sp) return 1;
1077 			sp += chartorune(&c, sp);
1078 			if (flags & REG_ICASE) {
1079 				if (incclasscanon(pc->cc, canon(c)))
1080 					return 1;
1081 			} else {
1082 				if (incclass(pc->cc, c))
1083 					return 1;
1084 			}
1085 			pc = pc + 1;
1086 			break;
1087 		case I_REF:
1088 			i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1089 			if (flags & REG_ICASE) {
1090 				if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1091 					return 1;
1092 			} else {
1093 				if (strncmp(sp, out->sub[pc->n].sp, i))
1094 					return 1;
1095 			}
1096 			if (i > 0)
1097 				sp += i;
1098 			pc = pc + 1;
1099 			break;
1100 
1101 		case I_BOL:
1102 			if (sp == bol && !(flags & REG_NOTBOL)) {
1103 				pc = pc + 1;
1104 				break;
1105 			}
1106 			if (flags & REG_NEWLINE) {
1107 				if (sp > bol && isnewline(sp[-1])) {
1108 					pc = pc + 1;
1109 					break;
1110 				}
1111 			}
1112 			return 1;
1113 		case I_EOL:
1114 			if (*sp == 0) {
1115 				pc = pc + 1;
1116 				break;
1117 			}
1118 			if (flags & REG_NEWLINE) {
1119 				if (isnewline(*sp)) {
1120 					pc = pc + 1;
1121 					break;
1122 				}
1123 			}
1124 			return 1;
1125 		case I_WORD:
1126 			i = sp > bol && iswordchar(sp[-1]);
1127 			i ^= iswordchar(sp[0]);
1128 			if (!i)
1129 				return 1;
1130 			pc = pc + 1;
1131 			break;
1132 		case I_NWORD:
1133 			i = sp > bol && iswordchar(sp[-1]);
1134 			i ^= iswordchar(sp[0]);
1135 			if (i)
1136 				return 1;
1137 			pc = pc + 1;
1138 			break;
1139 
1140 		case I_LPAR:
1141 			out->sub[pc->n].sp = sp;
1142 			pc = pc + 1;
1143 			break;
1144 		case I_RPAR:
1145 			out->sub[pc->n].ep = sp;
1146 			pc = pc + 1;
1147 			break;
1148 		default:
1149 			return 1;
1150 		}
1151 	}
1152 }
1153 
regexec(Reprog * prog,const char * sp,Resub * sub,int eflags)1154 int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1155 {
1156 	Resub scratch;
1157 	int i;
1158 
1159 	if (!sub)
1160 		sub = &scratch;
1161 
1162 	sub->nsub = prog->nsub;
1163 	for (i = 0; i < REG_MAXSUB; ++i)
1164 		sub->sub[i].sp = sub->sub[i].ep = NULL;
1165 
1166 	return match(prog->start, sp, sp, prog->flags | eflags, sub, 0);
1167 }
1168 
1169 #ifdef TEST
main(int argc,char ** argv)1170 int main(int argc, char **argv)
1171 {
1172 	const char *error;
1173 	const char *s;
1174 	Reprog *p;
1175 	Resub m;
1176 	int i;
1177 
1178 	if (argc > 1) {
1179 		p = regcomp(argv[1], 0, &error);
1180 		if (!p) {
1181 			fprintf(stderr, "regcomp: %s\n", error);
1182 			return 1;
1183 		}
1184 
1185 		if (argc > 2) {
1186 			s = argv[2];
1187 			printf("nsub = %d\n", p->nsub);
1188 			if (!regexec(p, s, &m, 0)) {
1189 				for (i = 0; i < m.nsub; ++i) {
1190 					int n = m.sub[i].ep - m.sub[i].sp;
1191 					if (n > 0)
1192 						printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1193 					else
1194 						printf("match %d: n=0 ''\n", i);
1195 				}
1196 			} else {
1197 				printf("no match\n");
1198 			}
1199 		}
1200 	}
1201 
1202 	return 0;
1203 }
1204 #endif
1205