1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <setjmp.h>
5 #include <limits.h>
6 
7 #include "regexp.h"
8 #include "utf.h"
9 
10 #define emit regemit
11 #define next regnext
12 #define accept regaccept
13 
14 #define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15 
16 #define REPINF 255
17 #define MAXSUB REG_MAXSUB
18 #define MAXPROG (32 << 10)
19 
20 typedef struct Reclass Reclass;
21 typedef struct Renode Renode;
22 typedef struct Reinst Reinst;
23 typedef struct Rethread Rethread;
24 
25 struct Reclass {
26 	Rune *end;
27 	Rune spans[64];
28 };
29 
30 struct Reprog {
31 	Reinst *start, *end;
32 	int flags;
33 	int nsub;
34 	Reclass cclass[16];
35 };
36 
37 struct cstate {
38 	Reprog *prog;
39 	Renode *pstart, *pend;
40 
41 	const char *source;
42 	int ncclass;
43 	int nsub;
44 	Renode *sub[MAXSUB];
45 
46 	int lookahead;
47 	Rune yychar;
48 	Reclass *yycc;
49 	int yymin, yymax;
50 
51 	const char *error;
52 	jmp_buf kaboom;
53 };
54 
die(struct cstate * g,const char * message)55 static void die(struct cstate *g, const char *message)
56 {
57 	g->error = message;
58 	longjmp(g->kaboom, 1);
59 }
60 
canon(Rune c)61 static int canon(Rune c)
62 {
63 	Rune u = toupperrune(c);
64 	if (c >= 128 && u < 128)
65 		return c;
66 	return u;
67 }
68 
69 /* Scan */
70 
71 enum {
72 	L_CHAR = 256,
73 	L_CCLASS,	/* character class */
74 	L_NCCLASS,	/* negative character class */
75 	L_NC,		/* "(?:" no capture */
76 	L_PLA,		/* "(?=" positive lookahead */
77 	L_NLA,		/* "(?!" negative lookahead */
78 	L_WORD,		/* "\b" word boundary */
79 	L_NWORD,	/* "\B" non-word boundary */
80 	L_REF,		/* "\1" back-reference */
81 	L_COUNT,	/* {M,N} */
82 };
83 
hex(struct cstate * g,int c)84 static int hex(struct cstate *g, int c)
85 {
86 	if (c >= '0' && c <= '9') return c - '0';
87 	if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
88 	if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
89 	die(g, "invalid escape sequence");
90 	return 0;
91 }
92 
dec(struct cstate * g,int c)93 static int dec(struct cstate *g, int c)
94 {
95 	if (c >= '0' && c <= '9') return c - '0';
96 	die(g, "invalid quantifier");
97 	return 0;
98 }
99 
100 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
101 
isunicodeletter(int c)102 static int isunicodeletter(int c)
103 {
104 	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
105 }
106 
nextrune(struct cstate * g)107 static int nextrune(struct cstate *g)
108 {
109 	g->source += chartorune(&g->yychar, g->source);
110 	if (g->yychar == '\\') {
111 		g->source += chartorune(&g->yychar, g->source);
112 		switch (g->yychar) {
113 		case 0: die(g, "unterminated escape sequence");
114 		case 'f': g->yychar = '\f'; return 0;
115 		case 'n': g->yychar = '\n'; return 0;
116 		case 'r': g->yychar = '\r'; return 0;
117 		case 't': g->yychar = '\t'; return 0;
118 		case 'v': g->yychar = '\v'; return 0;
119 		case 'c':
120 			g->yychar = (*g->source++) & 31;
121 			return 0;
122 		case 'x':
123 			g->yychar = hex(g, *g->source++) << 4;
124 			g->yychar += hex(g, *g->source++);
125 			if (g->yychar == 0) {
126 				g->yychar = '0';
127 				return 1;
128 			}
129 			return 0;
130 		case 'u':
131 			g->yychar = hex(g, *g->source++) << 12;
132 			g->yychar += hex(g, *g->source++) << 8;
133 			g->yychar += hex(g, *g->source++) << 4;
134 			g->yychar += hex(g, *g->source++);
135 			if (g->yychar == 0) {
136 				g->yychar = '0';
137 				return 1;
138 			}
139 			return 0;
140 		}
141 		if (strchr(ESCAPES, g->yychar))
142 			return 1;
143 		if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
144 			die(g, "invalid escape character");
145 		return 0;
146 	}
147 	return 0;
148 }
149 
lexcount(struct cstate * g)150 static int lexcount(struct cstate *g)
151 {
152 	g->yychar = *g->source++;
153 
154 	g->yymin = dec(g, g->yychar);
155 	g->yychar = *g->source++;
156 	while (g->yychar != ',' && g->yychar != '}') {
157 		g->yymin = g->yymin * 10 + dec(g, g->yychar);
158 		g->yychar = *g->source++;
159 	}
160 	if (g->yymin < 0 || g->yymin >= REPINF)
161 		die(g, "numeric overflow");
162 
163 	if (g->yychar == ',') {
164 		g->yychar = *g->source++;
165 		if (g->yychar == '}') {
166 			g->yymax = REPINF;
167 		} else {
168 			g->yymax = dec(g, g->yychar);
169 			g->yychar = *g->source++;
170 			while (g->yychar != '}') {
171 				g->yymax = g->yymax * 10 + dec(g, g->yychar);
172 				g->yychar = *g->source++;
173 			}
174 			if (g->yymax < 0 || g->yymax >= REPINF)
175 				die(g, "numeric overflow");
176 		}
177 	} else {
178 		g->yymax = g->yymin;
179 	}
180 
181 	return L_COUNT;
182 }
183 
newcclass(struct cstate * g)184 static void newcclass(struct cstate *g)
185 {
186 	if (g->ncclass >= nelem(g->prog->cclass))
187 		die(g, "too many character classes");
188 	g->yycc = g->prog->cclass + g->ncclass++;
189 	g->yycc->end = g->yycc->spans;
190 }
191 
addrange(struct cstate * g,Rune a,Rune b)192 static void addrange(struct cstate *g, Rune a, Rune b)
193 {
194 	if (a > b)
195 		die(g, "invalid character class range");
196 	if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
197 		die(g, "too many character class ranges");
198 	*g->yycc->end++ = a;
199 	*g->yycc->end++ = b;
200 }
201 
addranges_d(struct cstate * g)202 static void addranges_d(struct cstate *g)
203 {
204 	addrange(g, '0', '9');
205 }
206 
addranges_D(struct cstate * g)207 static void addranges_D(struct cstate *g)
208 {
209 	addrange(g, 0, '0'-1);
210 	addrange(g, '9'+1, 0xFFFF);
211 }
212 
addranges_s(struct cstate * g)213 static void addranges_s(struct cstate *g)
214 {
215 	addrange(g, 0x9, 0xD);
216 	addrange(g, 0x20, 0x20);
217 	addrange(g, 0xA0, 0xA0);
218 	addrange(g, 0x2028, 0x2029);
219 	addrange(g, 0xFEFF, 0xFEFF);
220 }
221 
addranges_S(struct cstate * g)222 static void addranges_S(struct cstate *g)
223 {
224 	addrange(g, 0, 0x9-1);
225 	addrange(g, 0xD+1, 0x20-1);
226 	addrange(g, 0x20+1, 0xA0-1);
227 	addrange(g, 0xA0+1, 0x2028-1);
228 	addrange(g, 0x2029+1, 0xFEFF-1);
229 	addrange(g, 0xFEFF+1, 0xFFFF);
230 }
231 
addranges_w(struct cstate * g)232 static void addranges_w(struct cstate *g)
233 {
234 	addrange(g, '0', '9');
235 	addrange(g, 'A', 'Z');
236 	addrange(g, '_', '_');
237 	addrange(g, 'a', 'z');
238 }
239 
addranges_W(struct cstate * g)240 static void addranges_W(struct cstate *g)
241 {
242 	addrange(g, 0, '0'-1);
243 	addrange(g, '9'+1, 'A'-1);
244 	addrange(g, 'Z'+1, '_'-1);
245 	addrange(g, '_'+1, 'a'-1);
246 	addrange(g, 'z'+1, 0xFFFF);
247 }
248 
lexclass(struct cstate * g)249 static int lexclass(struct cstate *g)
250 {
251 	int type = L_CCLASS;
252 	int quoted, havesave, havedash;
253 	Rune save = 0;
254 
255 	newcclass(g);
256 
257 	quoted = nextrune(g);
258 	if (!quoted && g->yychar == '^') {
259 		type = L_NCCLASS;
260 		quoted = nextrune(g);
261 	}
262 
263 	havesave = havedash = 0;
264 	for (;;) {
265 		if (g->yychar == 0)
266 			die(g, "unterminated character class");
267 		if (!quoted && g->yychar == ']')
268 			break;
269 
270 		if (!quoted && g->yychar == '-') {
271 			if (havesave) {
272 				if (havedash) {
273 					addrange(g, save, '-');
274 					havesave = havedash = 0;
275 				} else {
276 					havedash = 1;
277 				}
278 			} else {
279 				save = '-';
280 				havesave = 1;
281 			}
282 		} else if (quoted && strchr("DSWdsw", g->yychar)) {
283 			if (havesave) {
284 				addrange(g, save, save);
285 				if (havedash)
286 					addrange(g, '-', '-');
287 			}
288 			switch (g->yychar) {
289 			case 'd': addranges_d(g); break;
290 			case 's': addranges_s(g); break;
291 			case 'w': addranges_w(g); break;
292 			case 'D': addranges_D(g); break;
293 			case 'S': addranges_S(g); break;
294 			case 'W': addranges_W(g); break;
295 			}
296 			havesave = havedash = 0;
297 		} else {
298 			if (quoted) {
299 				if (g->yychar == 'b')
300 					g->yychar = '\b';
301 				else if (g->yychar == '0')
302 					g->yychar = 0;
303 				/* else identity escape */
304 			}
305 			if (havesave) {
306 				if (havedash) {
307 					addrange(g, save, g->yychar);
308 					havesave = havedash = 0;
309 				} else {
310 					addrange(g, save, save);
311 					save = g->yychar;
312 				}
313 			} else {
314 				save = g->yychar;
315 				havesave = 1;
316 			}
317 		}
318 
319 		quoted = nextrune(g);
320 	}
321 
322 	if (havesave) {
323 		addrange(g, save, save);
324 		if (havedash)
325 			addrange(g, '-', '-');
326 	}
327 
328 	return type;
329 }
330 
lex(struct cstate * g)331 static int lex(struct cstate *g)
332 {
333 	int quoted = nextrune(g);
334 	if (quoted) {
335 		switch (g->yychar) {
336 		case 'b': return L_WORD;
337 		case 'B': return L_NWORD;
338 		case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
339 		case 's': newcclass(g); addranges_s(g); return L_CCLASS;
340 		case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
341 		case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
342 		case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
343 		case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
344 		case '0': g->yychar = 0; return L_CHAR;
345 		}
346 		if (g->yychar >= '0' && g->yychar <= '9') {
347 			g->yychar -= '0';
348 			if (*g->source >= '0' && *g->source <= '9')
349 				g->yychar = g->yychar * 10 + *g->source++ - '0';
350 			return L_REF;
351 		}
352 		return L_CHAR;
353 	}
354 
355 	switch (g->yychar) {
356 	case 0:
357 	case '$': case ')': case '*': case '+':
358 	case '.': case '?': case '^': case '|':
359 		return g->yychar;
360 	}
361 
362 	if (g->yychar == '{')
363 		return lexcount(g);
364 	if (g->yychar == '[')
365 		return lexclass(g);
366 	if (g->yychar == '(') {
367 		if (g->source[0] == '?') {
368 			if (g->source[1] == ':') {
369 				g->source += 2;
370 				return L_NC;
371 			}
372 			if (g->source[1] == '=') {
373 				g->source += 2;
374 				return L_PLA;
375 			}
376 			if (g->source[1] == '!') {
377 				g->source += 2;
378 				return L_NLA;
379 			}
380 		}
381 		return '(';
382 	}
383 
384 	return L_CHAR;
385 }
386 
387 /* Parse */
388 
389 enum {
390 	P_CAT, P_ALT, P_REP,
391 	P_BOL, P_EOL, P_WORD, P_NWORD,
392 	P_PAR, P_PLA, P_NLA,
393 	P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
394 	P_REF,
395 };
396 
397 struct Renode {
398 	unsigned char type;
399 	unsigned char ng, m, n;
400 	Rune c;
401 	Reclass *cc;
402 	Renode *x;
403 	Renode *y;
404 };
405 
newnode(struct cstate * g,int type)406 static Renode *newnode(struct cstate *g, int type)
407 {
408 	Renode *node = g->pend++;
409 	node->type = type;
410 	node->cc = NULL;
411 	node->c = 0;
412 	node->ng = 0;
413 	node->m = 0;
414 	node->n = 0;
415 	node->x = node->y = NULL;
416 	return node;
417 }
418 
empty(Renode * node)419 static int empty(Renode *node)
420 {
421 	if (!node) return 1;
422 	switch (node->type) {
423 	default: return 1;
424 	case P_CAT: return empty(node->x) && empty(node->y);
425 	case P_ALT: return empty(node->x) || empty(node->y);
426 	case P_REP: return empty(node->x) || node->m == 0;
427 	case P_PAR: return empty(node->x);
428 	case P_REF: return empty(node->x);
429 	case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
430 	}
431 }
432 
newrep(struct cstate * g,Renode * atom,int ng,int min,int max)433 static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
434 {
435 	Renode *rep = newnode(g, P_REP);
436 	if (max == REPINF && empty(atom))
437 		die(g, "infinite loop matching the empty string");
438 	rep->ng = ng;
439 	rep->m = min;
440 	rep->n = max;
441 	rep->x = atom;
442 	return rep;
443 }
444 
next(struct cstate * g)445 static void next(struct cstate *g)
446 {
447 	g->lookahead = lex(g);
448 }
449 
accept(struct cstate * g,int t)450 static int accept(struct cstate *g, int t)
451 {
452 	if (g->lookahead == t) {
453 		next(g);
454 		return 1;
455 	}
456 	return 0;
457 }
458 
459 static Renode *parsealt(struct cstate *g);
460 
parseatom(struct cstate * g)461 static Renode *parseatom(struct cstate *g)
462 {
463 	Renode *atom;
464 	if (g->lookahead == L_CHAR) {
465 		atom = newnode(g, P_CHAR);
466 		atom->c = g->yychar;
467 		next(g);
468 		return atom;
469 	}
470 	if (g->lookahead == L_CCLASS) {
471 		atom = newnode(g, P_CCLASS);
472 		atom->cc = g->yycc;
473 		next(g);
474 		return atom;
475 	}
476 	if (g->lookahead == L_NCCLASS) {
477 		atom = newnode(g, P_NCCLASS);
478 		atom->cc = g->yycc;
479 		next(g);
480 		return atom;
481 	}
482 	if (g->lookahead == L_REF) {
483 		atom = newnode(g, P_REF);
484 		if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
485 			die(g, "invalid back-reference");
486 		atom->n = g->yychar;
487 		atom->x = g->sub[g->yychar];
488 		next(g);
489 		return atom;
490 	}
491 	if (accept(g, '.'))
492 		return newnode(g, P_ANY);
493 	if (accept(g, '(')) {
494 		atom = newnode(g, P_PAR);
495 		if (g->nsub == MAXSUB)
496 			die(g, "too many captures");
497 		atom->n = g->nsub++;
498 		atom->x = parsealt(g);
499 		g->sub[atom->n] = atom;
500 		if (!accept(g, ')'))
501 			die(g, "unmatched '('");
502 		return atom;
503 	}
504 	if (accept(g, L_NC)) {
505 		atom = parsealt(g);
506 		if (!accept(g, ')'))
507 			die(g, "unmatched '('");
508 		return atom;
509 	}
510 	if (accept(g, L_PLA)) {
511 		atom = newnode(g, P_PLA);
512 		atom->x = parsealt(g);
513 		if (!accept(g, ')'))
514 			die(g, "unmatched '('");
515 		return atom;
516 	}
517 	if (accept(g, L_NLA)) {
518 		atom = newnode(g, P_NLA);
519 		atom->x = parsealt(g);
520 		if (!accept(g, ')'))
521 			die(g, "unmatched '('");
522 		return atom;
523 	}
524 	die(g, "syntax error");
525 	return NULL;
526 }
527 
parserep(struct cstate * g)528 static Renode *parserep(struct cstate *g)
529 {
530 	Renode *atom;
531 
532 	if (accept(g, '^')) return newnode(g, P_BOL);
533 	if (accept(g, '$')) return newnode(g, P_EOL);
534 	if (accept(g, L_WORD)) return newnode(g, P_WORD);
535 	if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
536 
537 	atom = parseatom(g);
538 	if (g->lookahead == L_COUNT) {
539 		int min = g->yymin, max = g->yymax;
540 		next(g);
541 		if (max < min)
542 			die(g, "invalid quantifier");
543 		return newrep(g, atom, accept(g, '?'), min, max);
544 	}
545 	if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
546 	if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
547 	if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
548 	return atom;
549 }
550 
parsecat(struct cstate * g)551 static Renode *parsecat(struct cstate *g)
552 {
553 	Renode *cat, *x;
554 	if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
555 		cat = parserep(g);
556 		while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
557 			x = cat;
558 			cat = newnode(g, P_CAT);
559 			cat->x = x;
560 			cat->y = parserep(g);
561 		}
562 		return cat;
563 	}
564 	return NULL;
565 }
566 
parsealt(struct cstate * g)567 static Renode *parsealt(struct cstate *g)
568 {
569 	Renode *alt, *x;
570 	alt = parsecat(g);
571 	while (accept(g, '|')) {
572 		x = alt;
573 		alt = newnode(g, P_ALT);
574 		alt->x = x;
575 		alt->y = parsecat(g);
576 	}
577 	return alt;
578 }
579 
580 /* Compile */
581 
582 enum {
583 	I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
584 	I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
585 	I_BOL, I_EOL, I_WORD, I_NWORD,
586 	I_LPAR, I_RPAR
587 };
588 
589 struct Reinst {
590 	unsigned char opcode;
591 	unsigned char n;
592 	Rune c;
593 	Reclass *cc;
594 	Reinst *x;
595 	Reinst *y;
596 };
597 
count(struct cstate * g,Renode * node)598 static int count(struct cstate *g, Renode *node)
599 {
600 	int min, max, n;
601 	if (!node) return 0;
602 	switch (node->type) {
603 	default: return 1;
604 	case P_CAT: return count(g, node->x) + count(g, node->y);
605 	case P_ALT: return count(g, node->x) + count(g, node->y) + 2;
606 	case P_REP:
607 		min = node->m;
608 		max = node->n;
609 		if (min == max) n = count(g, node->x) * min;
610 		else if (max < REPINF) n = count(g, node->x) * max + (max - min);
611 		else n = count(g, node->x) * (min + 1) + 2;
612 		if (n < 0 || n > MAXPROG) die(g, "program too large");
613 		return n;
614 	case P_PAR: return count(g, node->x) + 2;
615 	case P_PLA: return count(g, node->x) + 2;
616 	case P_NLA: return count(g, node->x) + 2;
617 	}
618 }
619 
emit(Reprog * prog,int opcode)620 static Reinst *emit(Reprog *prog, int opcode)
621 {
622 	Reinst *inst = prog->end++;
623 	inst->opcode = opcode;
624 	inst->n = 0;
625 	inst->c = 0;
626 	inst->cc = NULL;
627 	inst->x = inst->y = NULL;
628 	return inst;
629 }
630 
compile(Reprog * prog,Renode * node)631 static void compile(Reprog *prog, Renode *node)
632 {
633 	Reinst *inst, *split, *jump;
634 	int i;
635 
636 	if (!node)
637 		return;
638 
639 	switch (node->type) {
640 	case P_CAT:
641 		compile(prog, node->x);
642 		compile(prog, node->y);
643 		break;
644 
645 	case P_ALT:
646 		split = emit(prog, I_SPLIT);
647 		compile(prog, node->x);
648 		jump = emit(prog, I_JUMP);
649 		compile(prog, node->y);
650 		split->x = split + 1;
651 		split->y = jump + 1;
652 		jump->x = prog->end;
653 		break;
654 
655 	case P_REP:
656 		inst = NULL; /* silence compiler warning. assert(node->m > 0). */
657 		for (i = 0; i < node->m; ++i) {
658 			inst = prog->end;
659 			compile(prog, node->x);
660 		}
661 		if (node->m == node->n)
662 			break;
663 		if (node->n < REPINF) {
664 			for (i = node->m; i < node->n; ++i) {
665 				split = emit(prog, I_SPLIT);
666 				compile(prog, node->x);
667 				if (node->ng) {
668 					split->y = split + 1;
669 					split->x = prog->end;
670 				} else {
671 					split->x = split + 1;
672 					split->y = prog->end;
673 				}
674 			}
675 		} else if (node->m == 0) {
676 			split = emit(prog, I_SPLIT);
677 			compile(prog, node->x);
678 			jump = emit(prog, I_JUMP);
679 			if (node->ng) {
680 				split->y = split + 1;
681 				split->x = prog->end;
682 			} else {
683 				split->x = split + 1;
684 				split->y = prog->end;
685 			}
686 			jump->x = split;
687 		} else {
688 			split = emit(prog, I_SPLIT);
689 			if (node->ng) {
690 				split->y = inst;
691 				split->x = prog->end;
692 			} else {
693 				split->x = inst;
694 				split->y = prog->end;
695 			}
696 		}
697 		break;
698 
699 	case P_BOL: emit(prog, I_BOL); break;
700 	case P_EOL: emit(prog, I_EOL); break;
701 	case P_WORD: emit(prog, I_WORD); break;
702 	case P_NWORD: emit(prog, I_NWORD); break;
703 
704 	case P_PAR:
705 		inst = emit(prog, I_LPAR);
706 		inst->n = node->n;
707 		compile(prog, node->x);
708 		inst = emit(prog, I_RPAR);
709 		inst->n = node->n;
710 		break;
711 	case P_PLA:
712 		split = emit(prog, I_PLA);
713 		compile(prog, node->x);
714 		emit(prog, I_END);
715 		split->x = split + 1;
716 		split->y = prog->end;
717 		break;
718 	case P_NLA:
719 		split = emit(prog, I_NLA);
720 		compile(prog, node->x);
721 		emit(prog, I_END);
722 		split->x = split + 1;
723 		split->y = prog->end;
724 		break;
725 
726 	case P_ANY:
727 		emit(prog, I_ANY);
728 		break;
729 	case P_CHAR:
730 		inst = emit(prog, I_CHAR);
731 		inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
732 		break;
733 	case P_CCLASS:
734 		inst = emit(prog, I_CCLASS);
735 		inst->cc = node->cc;
736 		break;
737 	case P_NCCLASS:
738 		inst = emit(prog, I_NCCLASS);
739 		inst->cc = node->cc;
740 		break;
741 	case P_REF:
742 		inst = emit(prog, I_REF);
743 		inst->n = node->n;
744 		break;
745 	}
746 }
747 
748 #ifdef TEST
dumpnode(Renode * node)749 static void dumpnode(Renode *node)
750 {
751 	Rune *p;
752 	if (!node) { printf("Empty"); return; }
753 	switch (node->type) {
754 	case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
755 	case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
756 	case P_REP:
757 		printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
758 		dumpnode(node->x);
759 		printf(")");
760 		break;
761 	case P_BOL: printf("Bol"); break;
762 	case P_EOL: printf("Eol"); break;
763 	case P_WORD: printf("Word"); break;
764 	case P_NWORD: printf("NotWord"); break;
765 	case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
766 	case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
767 	case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
768 	case P_ANY: printf("Any"); break;
769 	case P_CHAR: printf("Char(%c)", node->c); break;
770 	case P_CCLASS:
771 		printf("Class(");
772 		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
773 		printf(")");
774 		break;
775 	case P_NCCLASS:
776 		printf("NotClass(");
777 		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
778 		printf(")");
779 		break;
780 	case P_REF: printf("Ref(%d)", node->n); break;
781 	}
782 }
783 
dumpprog(Reprog * prog)784 static void dumpprog(Reprog *prog)
785 {
786 	Reinst *inst;
787 	int i;
788 	for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
789 		printf("% 5d: ", i);
790 		switch (inst->opcode) {
791 		case I_END: puts("end"); break;
792 		case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
793 		case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
794 		case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
795 		case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
796 		case I_ANY: puts("any"); break;
797 		case I_ANYNL: puts("anynl"); break;
798 		case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
799 		case I_CCLASS: puts("cclass"); break;
800 		case I_NCCLASS: puts("ncclass"); break;
801 		case I_REF: printf("ref %d\n", inst->n); break;
802 		case I_BOL: puts("bol"); break;
803 		case I_EOL: puts("eol"); break;
804 		case I_WORD: puts("word"); break;
805 		case I_NWORD: puts("nword"); break;
806 		case I_LPAR: printf("lpar %d\n", inst->n); break;
807 		case I_RPAR: printf("rpar %d\n", inst->n); break;
808 		}
809 	}
810 }
811 #endif
812 
regcompx(void * (* alloc)(void * ctx,void * p,int n),void * ctx,const char * pattern,int cflags,const char ** errorp)813 Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
814 	const char *pattern, int cflags, const char **errorp)
815 {
816 	struct cstate g;
817 	Renode *node;
818 	Reinst *split, *jump;
819 	int i, n;
820 
821 	g.pstart = NULL;
822 	g.prog = NULL;
823 
824 	if (setjmp(g.kaboom)) {
825 		if (errorp) *errorp = g.error;
826 		alloc(ctx, g.pstart, 0);
827 		alloc(ctx, g.prog, 0);
828 		return NULL;
829 	}
830 
831 	g.prog = alloc(ctx, NULL, sizeof (Reprog));
832 	if (!g.prog)
833 		die(&g, "cannot allocate regular expression");
834 	n = strlen(pattern) * 2;
835 	if (n > 0) {
836 		g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
837 		if (!g.pstart)
838 			die(&g, "cannot allocate regular expression parse list");
839 	}
840 
841 	g.source = pattern;
842 	g.ncclass = 0;
843 	g.nsub = 1;
844 	for (i = 0; i < MAXSUB; ++i)
845 		g.sub[i] = 0;
846 
847 	g.prog->flags = cflags;
848 
849 	next(&g);
850 	node = parsealt(&g);
851 	if (g.lookahead == ')')
852 		die(&g, "unmatched ')'");
853 	if (g.lookahead != 0)
854 		die(&g, "syntax error");
855 
856 #ifdef TEST
857 	dumpnode(node);
858 	putchar('\n');
859 #endif
860 
861 	n = 6 + count(&g, node);
862 	if (n < 0 || n > MAXPROG)
863 		die(&g, "program too large");
864 
865 	g.prog->nsub = g.nsub;
866 	g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
867 	if (!g.prog->start)
868 		die(&g, "cannot allocate regular expression instruction list");
869 
870 	split = emit(g.prog, I_SPLIT);
871 	split->x = split + 3;
872 	split->y = split + 1;
873 	emit(g.prog, I_ANYNL);
874 	jump = emit(g.prog, I_JUMP);
875 	jump->x = split;
876 	emit(g.prog, I_LPAR);
877 	compile(g.prog, node);
878 	emit(g.prog, I_RPAR);
879 	emit(g.prog, I_END);
880 
881 #ifdef TEST
882 	dumpprog(g.prog);
883 #endif
884 
885 	alloc(ctx, g.pstart, 0);
886 
887 	if (errorp) *errorp = NULL;
888 	return g.prog;
889 }
890 
regfreex(void * (* alloc)(void * ctx,void * p,int n),void * ctx,Reprog * prog)891 void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
892 {
893 	if (prog) {
894 		alloc(ctx, prog->start, 0);
895 		alloc(ctx, prog, 0);
896 	}
897 }
898 
default_alloc(void * ctx,void * p,int n)899 static void *default_alloc(void *ctx, void *p, int n)
900 {
901 	return realloc(p, (size_t)n);
902 }
903 
regcomp(const char * pattern,int cflags,const char ** errorp)904 Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
905 {
906 	return regcompx(default_alloc, NULL, pattern, cflags, errorp);
907 }
908 
regfree(Reprog * prog)909 void regfree(Reprog *prog)
910 {
911 	regfreex(default_alloc, NULL, prog);
912 }
913 
914 /* Match */
915 
isnewline(int c)916 static int isnewline(int c)
917 {
918 	return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
919 }
920 
iswordchar(int c)921 static int iswordchar(int c)
922 {
923 	return c == '_' ||
924 		(c >= 'a' && c <= 'z') ||
925 		(c >= 'A' && c <= 'Z') ||
926 		(c >= '0' && c <= '9');
927 }
928 
incclass(Reclass * cc,Rune c)929 static int incclass(Reclass *cc, Rune c)
930 {
931 	Rune *p;
932 	for (p = cc->spans; p < cc->end; p += 2)
933 		if (p[0] <= c && c <= p[1])
934 			return 1;
935 	return 0;
936 }
937 
incclasscanon(Reclass * cc,Rune c)938 static int incclasscanon(Reclass *cc, Rune c)
939 {
940 	Rune *p, r;
941 	for (p = cc->spans; p < cc->end; p += 2)
942 		for (r = p[0]; r <= p[1]; ++r)
943 			if (c == canon(r))
944 				return 1;
945 	return 0;
946 }
947 
strncmpcanon(const char * a,const char * b,int n)948 static int strncmpcanon(const char *a, const char *b, int n)
949 {
950 	Rune ra, rb;
951 	int c;
952 	while (n--) {
953 		if (!*a) return -1;
954 		if (!*b) return 1;
955 		a += chartorune(&ra, a);
956 		b += chartorune(&rb, b);
957 		c = canon(ra) - canon(rb);
958 		if (c)
959 			return c;
960 	}
961 	return 0;
962 }
963 
match(Reinst * pc,const char * sp,const char * bol,int flags,Resub * out)964 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
965 {
966 	Resub scratch;
967 	int i;
968 	Rune c;
969 
970 	for (;;) {
971 		switch (pc->opcode) {
972 		case I_END:
973 			return 1;
974 		case I_JUMP:
975 			pc = pc->x;
976 			break;
977 		case I_SPLIT:
978 			scratch = *out;
979 			if (match(pc->x, sp, bol, flags, &scratch)) {
980 				*out = scratch;
981 				return 1;
982 			}
983 			pc = pc->y;
984 			break;
985 
986 		case I_PLA:
987 			if (!match(pc->x, sp, bol, flags, out))
988 				return 0;
989 			pc = pc->y;
990 			break;
991 		case I_NLA:
992 			scratch = *out;
993 			if (match(pc->x, sp, bol, flags, &scratch))
994 				return 0;
995 			pc = pc->y;
996 			break;
997 
998 		case I_ANYNL:
999 			sp += chartorune(&c, sp);
1000 			if (c == 0)
1001 				return 0;
1002 			pc = pc + 1;
1003 			break;
1004 		case I_ANY:
1005 			sp += chartorune(&c, sp);
1006 			if (c == 0)
1007 				return 0;
1008 			if (isnewline(c))
1009 				return 0;
1010 			pc = pc + 1;
1011 			break;
1012 		case I_CHAR:
1013 			sp += chartorune(&c, sp);
1014 			if (c == 0)
1015 				return 0;
1016 			if (flags & REG_ICASE)
1017 				c = canon(c);
1018 			if (c != pc->c)
1019 				return 0;
1020 			pc = pc + 1;
1021 			break;
1022 		case I_CCLASS:
1023 			sp += chartorune(&c, sp);
1024 			if (c == 0)
1025 				return 0;
1026 			if (flags & REG_ICASE) {
1027 				if (!incclasscanon(pc->cc, canon(c)))
1028 					return 0;
1029 			} else {
1030 				if (!incclass(pc->cc, c))
1031 					return 0;
1032 			}
1033 			pc = pc + 1;
1034 			break;
1035 		case I_NCCLASS:
1036 			sp += chartorune(&c, sp);
1037 			if (c == 0)
1038 				return 0;
1039 			if (flags & REG_ICASE) {
1040 				if (incclasscanon(pc->cc, canon(c)))
1041 					return 0;
1042 			} else {
1043 				if (incclass(pc->cc, c))
1044 					return 0;
1045 			}
1046 			pc = pc + 1;
1047 			break;
1048 		case I_REF:
1049 			i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1050 			if (flags & REG_ICASE) {
1051 				if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1052 					return 0;
1053 			} else {
1054 				if (strncmp(sp, out->sub[pc->n].sp, i))
1055 					return 0;
1056 			}
1057 			if (i > 0)
1058 				sp += i;
1059 			pc = pc + 1;
1060 			break;
1061 
1062 		case I_BOL:
1063 			if (sp == bol && !(flags & REG_NOTBOL)) {
1064 				pc = pc + 1;
1065 				break;
1066 			}
1067 			if (flags & REG_NEWLINE) {
1068 				if (sp > bol && isnewline(sp[-1])) {
1069 					pc = pc + 1;
1070 					break;
1071 				}
1072 			}
1073 			return 0;
1074 		case I_EOL:
1075 			if (*sp == 0) {
1076 				pc = pc + 1;
1077 				break;
1078 			}
1079 			if (flags & REG_NEWLINE) {
1080 				if (isnewline(*sp)) {
1081 					pc = pc + 1;
1082 					break;
1083 				}
1084 			}
1085 			return 0;
1086 		case I_WORD:
1087 			i = sp > bol && iswordchar(sp[-1]);
1088 			i ^= iswordchar(sp[0]);
1089 			if (!i)
1090 				return 0;
1091 			pc = pc + 1;
1092 			break;
1093 		case I_NWORD:
1094 			i = sp > bol && iswordchar(sp[-1]);
1095 			i ^= iswordchar(sp[0]);
1096 			if (i)
1097 				return 0;
1098 			pc = pc + 1;
1099 			break;
1100 
1101 		case I_LPAR:
1102 			out->sub[pc->n].sp = sp;
1103 			pc = pc + 1;
1104 			break;
1105 		case I_RPAR:
1106 			out->sub[pc->n].ep = sp;
1107 			pc = pc + 1;
1108 			break;
1109 		default:
1110 			return 0;
1111 		}
1112 	}
1113 }
1114 
regexec(Reprog * prog,const char * sp,Resub * sub,int eflags)1115 int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1116 {
1117 	Resub scratch;
1118 	int i;
1119 
1120 	if (!sub)
1121 		sub = &scratch;
1122 
1123 	sub->nsub = prog->nsub;
1124 	for (i = 0; i < MAXSUB; ++i)
1125 		sub->sub[i].sp = sub->sub[i].ep = NULL;
1126 
1127 	return !match(prog->start, sp, sp, prog->flags | eflags, sub);
1128 }
1129 
1130 #ifdef TEST
main(int argc,char ** argv)1131 int main(int argc, char **argv)
1132 {
1133 	const char *error;
1134 	const char *s;
1135 	Reprog *p;
1136 	Resub m;
1137 	int i;
1138 
1139 	if (argc > 1) {
1140 		p = regcomp(argv[1], 0, &error);
1141 		if (!p) {
1142 			fprintf(stderr, "regcomp: %s\n", error);
1143 			return 1;
1144 		}
1145 
1146 		if (argc > 2) {
1147 			s = argv[2];
1148 			printf("nsub = %d\n", p->nsub);
1149 			if (!regexec(p, s, &m, 0)) {
1150 				for (i = 0; i < m.nsub; ++i) {
1151 					int n = m.sub[i].ep - m.sub[i].sp;
1152 					if (n > 0)
1153 						printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1154 					else
1155 						printf("match %d: n=0 ''\n", i);
1156 				}
1157 			} else {
1158 				printf("no match\n");
1159 			}
1160 		}
1161 	}
1162 
1163 	return 0;
1164 }
1165 #endif
1166