1 /**
2  * Copyright: public domain
3  *
4  * From https://github.com/ccxvii/minilibs sha 875c33568b5a4aa4fb3dd0c52ea98f7f0e5ca684:
5  *
6  * These libraries are in the public domain (or the equivalent where that is not possible).
7  * You can do anything you want with them. You have no legal obligation to do anything else,
8  * although I appreciate attribution.
9  */
10 
11 #include "rd.h"
12 
13 #include <stdlib.h>
14 #include <string.h>
15 #include <setjmp.h>
16 #include <stdio.h>
17 
18 #include "regexp.h"
19 
20 #define nelem(a) (sizeof (a) / sizeof (a)[0])
21 
22 typedef unsigned int Rune;
23 
isalpharune(Rune c)24 static int isalpharune(Rune c)
25 {
26 	/* TODO: Add unicode support */
27 	return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
28 }
29 
toupperrune(Rune c)30 static Rune toupperrune(Rune c)
31 {
32 	/* TODO: Add unicode support */
33 	if (c >= 'a' && c <= 'z')
34 		return c - 'a' + 'A';
35 	return c;
36 }
37 
chartorune(Rune * r,const char * s)38 static int chartorune(Rune *r, const char *s)
39 {
40 	/* TODO: Add UTF-8 decoding */
41 	*r = *s;
42 	return 1;
43 }
44 
45 #define REPINF 255
46 #define MAXTHREAD 1000
47 #define MAXSUB REG_MAXSUB
48 
49 typedef struct Reclass Reclass;
50 typedef struct Renode Renode;
51 typedef struct Reinst Reinst;
52 typedef struct Rethread Rethread;
53 typedef struct Restate Restate;
54 
55 struct Reclass {
56 	Rune *end;
57 	Rune spans[64];
58 };
59 
60 struct Restate {
61 	Reprog *prog;
62 	Renode *pstart, *pend;
63 
64 	const char *source;
65 	unsigned int ncclass;
66 	unsigned int nsub;
67 	Renode *sub[MAXSUB];
68 
69 	int lookahead;
70 	Rune yychar;
71 	Reclass *yycc;
72 	int yymin, yymax;
73 
74 	const char *error;
75 	jmp_buf kaboom;
76 };
77 
78 struct Reprog {
79 	Reinst *start, *end;
80 	int flags;
81 	unsigned int nsub;
82 	Reclass cclass[16];
83         Restate g;  /**< Upstream has this as a global variable */
84 };
85 
die(Restate * g,const char * message)86 static void die(Restate *g, const char *message)
87 {
88 	g->error = message;
89 	longjmp(g->kaboom, 1);
90 }
91 
canon(Rune c)92 static Rune canon(Rune c)
93 {
94 	Rune u = toupperrune(c);
95 	if (c >= 128 && u < 128)
96 		return c;
97 	return u;
98 }
99 
100 /* Scan */
101 
102 enum {
103 	L_CHAR = 256,
104 	L_CCLASS,	/* character class */
105 	L_NCCLASS,	/* negative character class */
106 	L_NC,		/* "(?:" no capture */
107 	L_PLA,		/* "(?=" positive lookahead */
108 	L_NLA,		/* "(?!" negative lookahead */
109 	L_WORD,		/* "\b" word boundary */
110 	L_NWORD,	/* "\B" non-word boundary */
111 	L_REF,		/* "\1" back-reference */
112 	L_COUNT		/* {M,N} */
113 };
114 
hex(Restate * g,int c)115 static int hex(Restate *g, int c)
116 {
117 	if (c >= '0' && c <= '9') return c - '0';
118 	if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
119 	if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
120 	die(g, "invalid escape sequence");
121 	return 0;
122 }
123 
dec(Restate * g,int c)124 static int dec(Restate *g, int c)
125 {
126 	if (c >= '0' && c <= '9') return c - '0';
127 	die(g, "invalid quantifier");
128 	return 0;
129 }
130 
131 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
132 
nextrune(Restate * g)133 static int nextrune(Restate *g)
134 {
135 	g->source += chartorune(&g->yychar, g->source);
136 	if (g->yychar == '\\') {
137 		g->source += chartorune(&g->yychar, g->source);
138 		switch (g->yychar) {
139 		case 0: die(g, "unterminated escape sequence");
140 		case 'f': g->yychar = '\f'; return 0;
141 		case 'n': g->yychar = '\n'; return 0;
142 		case 'r': g->yychar = '\r'; return 0;
143 		case 't': g->yychar = '\t'; return 0;
144 		case 'v': g->yychar = '\v'; return 0;
145 		case 'c':
146 			g->yychar = (*g->source++) & 31;
147 			return 0;
148 		case 'x':
149 			g->yychar = hex(g, *g->source++) << 4;
150 			g->yychar += hex(g, *g->source++);
151 			if (g->yychar == 0) {
152 				g->yychar = '0';
153 				return 1;
154 			}
155 			return 0;
156 		case 'u':
157 			g->yychar = hex(g, *g->source++) << 12;
158 			g->yychar += hex(g, *g->source++) << 8;
159 			g->yychar += hex(g, *g->source++) << 4;
160 			g->yychar += hex(g, *g->source++);
161 			if (g->yychar == 0) {
162 				g->yychar = '0';
163 				return 1;
164 			}
165 			return 0;
166 		}
167 		if (strchr(ESCAPES, g->yychar))
168 			return 1;
169 		if (isalpharune(g->yychar) || g->yychar == '_') /* check identity escape */
170 			die(g, "invalid escape character");
171 		return 0;
172 	}
173 	return 0;
174 }
175 
lexcount(Restate * g)176 static int lexcount(Restate *g)
177 {
178 	g->yychar = *g->source++;
179 
180 	g->yymin = dec(g, g->yychar);
181 	g->yychar = *g->source++;
182 	while (g->yychar != ',' && g->yychar != '}') {
183 		g->yymin = g->yymin * 10 + dec(g, g->yychar);
184 		g->yychar = *g->source++;
185 	}
186 	if (g->yymin >= REPINF)
187 		die(g, "numeric overflow");
188 
189 	if (g->yychar == ',') {
190 		g->yychar = *g->source++;
191 		if (g->yychar == '}') {
192 			g->yymax = REPINF;
193 		} else {
194 			g->yymax = dec(g, g->yychar);
195 			g->yychar = *g->source++;
196 			while (g->yychar != '}') {
197 				g->yymax = g->yymax * 10 + dec(g, g->yychar);
198 				g->yychar = *g->source++;
199 			}
200 			if (g->yymax >= REPINF)
201 				die(g, "numeric overflow");
202 		}
203 	} else {
204 		g->yymax = g->yymin;
205 	}
206 
207 	return L_COUNT;
208 }
209 
newcclass(Restate * g)210 static void newcclass(Restate *g)
211 {
212 	if (g->ncclass >= nelem(g->prog->cclass))
213 		die(g, "too many character classes");
214 	g->yycc = g->prog->cclass + g->ncclass++;
215 	g->yycc->end = g->yycc->spans;
216 }
217 
addrange(Restate * g,Rune a,Rune b)218 static void addrange(Restate *g, Rune a, Rune b)
219 {
220 	if (a > b)
221 		die(g, "invalid character class range");
222 	if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
223 		die(g, "too many character class ranges");
224 	*g->yycc->end++ = a;
225 	*g->yycc->end++ = b;
226 }
227 
addranges_d(Restate * g)228 static void addranges_d(Restate *g)
229 {
230 	addrange(g, '0', '9');
231 }
232 
addranges_D(Restate * g)233 static void addranges_D(Restate *g)
234 {
235 	addrange(g, 0, '0'-1);
236 	addrange(g, '9'+1, 0xFFFF);
237 }
238 
addranges_s(Restate * g)239 static void addranges_s(Restate *g)
240 {
241 	addrange(g, 0x9, 0x9);
242 	addrange(g, 0xA, 0xD);
243 	addrange(g, 0x20, 0x20);
244 	addrange(g, 0xA0, 0xA0);
245 	addrange(g, 0x2028, 0x2029);
246 	addrange(g, 0xFEFF, 0xFEFF);
247 }
248 
addranges_S(Restate * g)249 static void addranges_S(Restate *g)
250 {
251 	addrange(g, 0, 0x9-1);
252 	addrange(g, 0x9+1, 0xA-1);
253 	addrange(g, 0xD+1, 0x20-1);
254 	addrange(g, 0x20+1, 0xA0-1);
255 	addrange(g, 0xA0+1, 0x2028-1);
256 	addrange(g, 0x2029+1, 0xFEFF-1);
257 	addrange(g, 0xFEFF+1, 0xFFFF);
258 }
259 
addranges_w(Restate * g)260 static void addranges_w(Restate *g)
261 {
262 	addrange(g, '0', '9');
263 	addrange(g, 'A', 'Z');
264 	addrange(g, '_', '_');
265 	addrange(g, 'a', 'z');
266 }
267 
addranges_W(Restate * g)268 static void addranges_W(Restate *g)
269 {
270 	addrange(g, 0, '0'-1);
271 	addrange(g, '9'+1, 'A'-1);
272 	addrange(g, 'Z'+1, '_'-1);
273 	addrange(g, '_'+1, 'a'-1);
274 	addrange(g, 'z'+1, 0xFFFF);
275 }
276 
lexclass(Restate * g)277 static int lexclass(Restate *g)
278 {
279 	int type = L_CCLASS;
280 	int quoted, havesave, havedash;
281 	Rune save = 0;
282 
283 	newcclass(g);
284 
285 	quoted = nextrune(g);
286 	if (!quoted && g->yychar == '^') {
287 		type = L_NCCLASS;
288 		quoted = nextrune(g);
289 	}
290 
291 	havesave = havedash = 0;
292 	for (;;) {
293 		if (g->yychar == 0)
294 			die(g, "unterminated character class");
295 		if (!quoted && g->yychar == ']')
296 			break;
297 
298 		if (!quoted && g->yychar == '-') {
299 			if (havesave) {
300 				if (havedash) {
301 					addrange(g, save, '-');
302 					havesave = havedash = 0;
303 				} else {
304 					havedash = 1;
305 				}
306 			} else {
307 				save = '-';
308 				havesave = 1;
309 			}
310 		} else if (quoted && strchr("DSWdsw", g->yychar)) {
311 			if (havesave) {
312 				addrange(g, save, save);
313 				if (havedash)
314 					addrange(g, '-', '-');
315 			}
316 			switch (g->yychar) {
317 			case 'd': addranges_d(g); break;
318 			case 's': addranges_s(g); break;
319 			case 'w': addranges_w(g); break;
320 			case 'D': addranges_D(g); break;
321 			case 'S': addranges_S(g); break;
322 			case 'W': addranges_W(g); break;
323 			}
324 			havesave = havedash = 0;
325 		} else {
326 			if (quoted) {
327 				if (g->yychar == 'b')
328 					g->yychar = '\b';
329 				else if (g->yychar == '0')
330 					g->yychar = 0;
331 				/* else identity escape */
332 			}
333 			if (havesave) {
334 				if (havedash) {
335 					addrange(g, save, g->yychar);
336 					havesave = havedash = 0;
337 				} else {
338 					addrange(g, save, save);
339 					save = g->yychar;
340 				}
341 			} else {
342 				save = g->yychar;
343 				havesave = 1;
344 			}
345 		}
346 
347 		quoted = nextrune(g);
348 	}
349 
350 	if (havesave) {
351 		addrange(g, save, save);
352 		if (havedash)
353 			addrange(g, '-', '-');
354 	}
355 
356 	return type;
357 }
358 
lex(Restate * g)359 static int lex(Restate *g)
360 {
361 	int quoted = nextrune(g);
362 	if (quoted) {
363 		switch (g->yychar) {
364 		case 'b': return L_WORD;
365 		case 'B': return L_NWORD;
366 		case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
367 		case 's': newcclass(g); addranges_s(g); return L_CCLASS;
368 		case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
369 		case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
370 		case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
371 		case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
372 		case '0': g->yychar = 0; return L_CHAR;
373 		}
374 		if (g->yychar >= '0' && g->yychar <= '9') {
375 			g->yychar -= '0';
376 			if (*g->source >= '0' && *g->source <= '9')
377 				g->yychar = g->yychar * 10 + *g->source++ - '0';
378 			return L_REF;
379 		}
380 		return L_CHAR;
381 	}
382 
383 	switch (g->yychar) {
384 	case 0:
385 	case '$': case ')': case '*': case '+':
386 	case '.': case '?': case '^': case '|':
387 		return g->yychar;
388 	}
389 
390 	if (g->yychar == '{')
391 		return lexcount(g);
392 	if (g->yychar == '[')
393 		return lexclass(g);
394 	if (g->yychar == '(') {
395 		if (g->source[0] == '?') {
396 			if (g->source[1] == ':') {
397 				g->source += 2;
398 				return L_NC;
399 			}
400 			if (g->source[1] == '=') {
401 				g->source += 2;
402 				return L_PLA;
403 			}
404 			if (g->source[1] == '!') {
405 				g->source += 2;
406 				return L_NLA;
407 			}
408 		}
409 		return '(';
410 	}
411 
412 	return L_CHAR;
413 }
414 
415 /* Parse */
416 
417 enum {
418 	P_CAT, P_ALT, P_REP,
419 	P_BOL, P_EOL, P_WORD, P_NWORD,
420 	P_PAR, P_PLA, P_NLA,
421 	P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
422 	P_REF
423 };
424 
425 struct Renode {
426 	unsigned char type;
427 	unsigned char ng, m, n;
428 	Rune c;
429 	Reclass *cc;
430 	Renode *x;
431 	Renode *y;
432 };
433 
newnode(Restate * g,int type)434 static Renode *newnode(Restate *g, int type)
435 {
436 	Renode *node = g->pend++;
437 	node->type = type;
438 	node->cc = NULL;
439 	node->c = 0;
440 	node->ng = 0;
441 	node->m = 0;
442 	node->n = 0;
443 	node->x = node->y = NULL;
444 	return node;
445 }
446 
empty(Renode * node)447 static int empty(Renode *node)
448 {
449 	if (!node) return 1;
450 	switch (node->type) {
451 	default: return 1;
452 	case P_CAT: return empty(node->x) && empty(node->y);
453 	case P_ALT: return empty(node->x) || empty(node->y);
454 	case P_REP: return empty(node->x) || node->m == 0;
455 	case P_PAR: return empty(node->x);
456 	case P_REF: return empty(node->x);
457 	case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
458 	}
459 }
460 
newrep(Restate * g,Renode * atom,int ng,int min,int max)461 static Renode *newrep(Restate *g, Renode *atom, int ng, int min, int max)
462 {
463 	Renode *rep = newnode(g, P_REP);
464 	if (max == REPINF && empty(atom))
465 		die(g, "infinite loop matching the empty string");
466 	rep->ng = ng;
467 	rep->m = min;
468 	rep->n = max;
469 	rep->x = atom;
470 	return rep;
471 }
472 
next(Restate * g)473 static void next(Restate *g)
474 {
475 	g->lookahead = lex(g);
476 }
477 
re_accept(Restate * g,int t)478 static int re_accept(Restate *g, int t)
479 {
480 	if (g->lookahead == t) {
481 		next(g);
482 		return 1;
483 	}
484 	return 0;
485 }
486 
487 static Renode *parsealt(Restate *g);
488 
parseatom(Restate * g)489 static Renode *parseatom(Restate *g)
490 {
491 	Renode *atom;
492 	if (g->lookahead == L_CHAR) {
493 		atom = newnode(g, P_CHAR);
494 		atom->c = g->yychar;
495 		next(g);
496 		return atom;
497 	}
498 	if (g->lookahead == L_CCLASS) {
499 		atom = newnode(g, P_CCLASS);
500 		atom->cc = g->yycc;
501 		next(g);
502 		return atom;
503 	}
504 	if (g->lookahead == L_NCCLASS) {
505 		atom = newnode(g, P_NCCLASS);
506 		atom->cc = g->yycc;
507 		next(g);
508 		return atom;
509 	}
510 	if (g->lookahead == L_REF) {
511 		atom = newnode(g, P_REF);
512 		if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
513 			die(g, "invalid back-reference");
514 		atom->n = g->yychar;
515 		atom->x = g->sub[g->yychar];
516 		next(g);
517 		return atom;
518 	}
519 	if (re_accept(g, '.'))
520 		return newnode(g, P_ANY);
521 	if (re_accept(g, '(')) {
522 		atom = newnode(g, P_PAR);
523 		if (g->nsub == MAXSUB)
524 			die(g, "too many captures");
525 		atom->n = g->nsub++;
526 		atom->x = parsealt(g);
527 		g->sub[atom->n] = atom;
528 		if (!re_accept(g, ')'))
529 			die(g, "unmatched '('");
530 		return atom;
531 	}
532 	if (re_accept(g, L_NC)) {
533 		atom = parsealt(g);
534 		if (!re_accept(g, ')'))
535 			die(g, "unmatched '('");
536 		return atom;
537 	}
538 	if (re_accept(g, L_PLA)) {
539 		atom = newnode(g, P_PLA);
540 		atom->x = parsealt(g);
541 		if (!re_accept(g, ')'))
542 			die(g, "unmatched '('");
543 		return atom;
544 	}
545 	if (re_accept(g, L_NLA)) {
546 		atom = newnode(g, P_NLA);
547 		atom->x = parsealt(g);
548 		if (!re_accept(g, ')'))
549 			die(g, "unmatched '('");
550 		return atom;
551 	}
552 	die(g, "syntax error");
553 	return NULL;
554 }
555 
parserep(Restate * g)556 static Renode *parserep(Restate *g)
557 {
558 	Renode *atom;
559 
560 	if (re_accept(g, '^')) return newnode(g, P_BOL);
561 	if (re_accept(g, '$')) return newnode(g, P_EOL);
562 	if (re_accept(g, L_WORD)) return newnode(g, P_WORD);
563 	if (re_accept(g, L_NWORD)) return newnode(g, P_NWORD);
564 
565 	atom = parseatom(g);
566 	if (g->lookahead == L_COUNT) {
567 		int min = g->yymin, max = g->yymax;
568 		next(g);
569 		if (max < min)
570 			die(g, "invalid quantifier");
571 		return newrep(g, atom, re_accept(g, '?'), min, max);
572 	}
573 	if (re_accept(g, '*')) return newrep(g, atom, re_accept(g, '?'), 0, REPINF);
574 	if (re_accept(g, '+')) return newrep(g, atom, re_accept(g, '?'), 1, REPINF);
575 	if (re_accept(g, '?')) return newrep(g, atom, re_accept(g, '?'), 0, 1);
576 	return atom;
577 }
578 
parsecat(Restate * g)579 static Renode *parsecat(Restate *g)
580 {
581 	Renode *cat, *x;
582 	if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
583 		cat = parserep(g);
584 		while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
585 			x = cat;
586 			cat = newnode(g, P_CAT);
587 			cat->x = x;
588 			cat->y = parserep(g);
589 		}
590 		return cat;
591 	}
592 	return NULL;
593 }
594 
parsealt(Restate * g)595 static Renode *parsealt(Restate *g)
596 {
597 	Renode *alt, *x;
598 	alt = parsecat(g);
599 	while (re_accept(g, '|')) {
600 		x = alt;
601 		alt = newnode(g, P_ALT);
602 		alt->x = x;
603 		alt->y = parsecat(g);
604 	}
605 	return alt;
606 }
607 
608 /* Compile */
609 
610 enum {
611 	I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
612 	I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
613 	I_BOL, I_EOL, I_WORD, I_NWORD,
614 	I_LPAR, I_RPAR
615 };
616 
617 struct Reinst {
618 	unsigned char opcode;
619 	unsigned char n;
620 	Rune c;
621 	Reclass *cc;
622 	Reinst *x;
623 	Reinst *y;
624 };
625 
count(Renode * node)626 static unsigned int count(Renode *node)
627 {
628 	unsigned int min, max;
629 	if (!node) return 0;
630 	switch (node->type) {
631 	default: return 1;
632 	case P_CAT: return count(node->x) + count(node->y);
633 	case P_ALT: return count(node->x) + count(node->y) + 2;
634 	case P_REP:
635 		min = node->m;
636 		max = node->n;
637 		if (min == max) return count(node->x) * min;
638 		if (max < REPINF) return count(node->x) * max + (max - min);
639 		return count(node->x) * (min + 1) + 2;
640 	case P_PAR: return count(node->x) + 2;
641 	case P_PLA: return count(node->x) + 2;
642 	case P_NLA: return count(node->x) + 2;
643 	}
644 }
645 
emit(Reprog * prog,int opcode)646 static Reinst *emit(Reprog *prog, int opcode)
647 {
648 	Reinst *inst = prog->end++;
649 	inst->opcode = opcode;
650 	inst->n = 0;
651 	inst->c = 0;
652 	inst->cc = NULL;
653 	inst->x = inst->y = NULL;
654 	return inst;
655 }
656 
compile(Reprog * prog,Renode * node)657 static void compile(Reprog *prog, Renode *node)
658 {
659 	Reinst *inst, *split, *jump;
660 	unsigned int i;
661 
662 	if (!node)
663 		return;
664 
665 	switch (node->type) {
666 	case P_CAT:
667 		compile(prog, node->x);
668 		compile(prog, node->y);
669 		break;
670 
671 	case P_ALT:
672 		split = emit(prog, I_SPLIT);
673 		compile(prog, node->x);
674 		jump = emit(prog, I_JUMP);
675 		compile(prog, node->y);
676 		split->x = split + 1;
677 		split->y = jump + 1;
678 		jump->x = prog->end;
679 		break;
680 
681 	case P_REP:
682 		for (i = 0; i < node->m; ++i) {
683 			inst = prog->end;
684 			compile(prog, node->x);
685 		}
686 		if (node->m == node->n)
687 			break;
688 		if (node->n < REPINF) {
689 			for (i = node->m; i < node->n; ++i) {
690 				split = emit(prog, I_SPLIT);
691 				compile(prog, node->x);
692 				if (node->ng) {
693 					split->y = split + 1;
694 					split->x = prog->end;
695 				} else {
696 					split->x = split + 1;
697 					split->y = prog->end;
698 				}
699 			}
700 		} else if (node->m == 0) {
701 			split = emit(prog, I_SPLIT);
702 			compile(prog, node->x);
703 			jump = emit(prog, I_JUMP);
704 			if (node->ng) {
705 				split->y = split + 1;
706 				split->x = prog->end;
707 			} else {
708 				split->x = split + 1;
709 				split->y = prog->end;
710 			}
711 			jump->x = split;
712 		} else {
713 			split = emit(prog, I_SPLIT);
714 			if (node->ng) {
715 				split->y = inst;
716 				split->x = prog->end;
717 			} else {
718 				split->x = inst;
719 				split->y = prog->end;
720 			}
721 		}
722 		break;
723 
724 	case P_BOL: emit(prog, I_BOL); break;
725 	case P_EOL: emit(prog, I_EOL); break;
726 	case P_WORD: emit(prog, I_WORD); break;
727 	case P_NWORD: emit(prog, I_NWORD); break;
728 
729 	case P_PAR:
730 		inst = emit(prog, I_LPAR);
731 		inst->n = node->n;
732 		compile(prog, node->x);
733 		inst = emit(prog, I_RPAR);
734 		inst->n = node->n;
735 		break;
736 	case P_PLA:
737 		split = emit(prog, I_PLA);
738 		compile(prog, node->x);
739 		emit(prog, I_END);
740 		split->x = split + 1;
741 		split->y = prog->end;
742 		break;
743 	case P_NLA:
744 		split = emit(prog, I_NLA);
745 		compile(prog, node->x);
746 		emit(prog, I_END);
747 		split->x = split + 1;
748 		split->y = prog->end;
749 		break;
750 
751 	case P_ANY:
752 		emit(prog, I_ANY);
753 		break;
754 	case P_CHAR:
755 		inst = emit(prog, I_CHAR);
756 		inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
757 		break;
758 	case P_CCLASS:
759 		inst = emit(prog, I_CCLASS);
760 		inst->cc = node->cc;
761 		break;
762 	case P_NCCLASS:
763 		inst = emit(prog, I_NCCLASS);
764 		inst->cc = node->cc;
765 		break;
766 	case P_REF:
767 		inst = emit(prog, I_REF);
768 		inst->n = node->n;
769 		break;
770 	}
771 }
772 
773 #ifdef TEST
dumpnode(Renode * node)774 static void dumpnode(Renode *node)
775 {
776 	Rune *p;
777 	if (!node) { printf("Empty"); return; }
778 	switch (node->type) {
779 	case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
780 	case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
781 	case P_REP:
782 		printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
783 		dumpnode(node->x);
784 		printf(")");
785 		break;
786 	case P_BOL: printf("Bol"); break;
787 	case P_EOL: printf("Eol"); break;
788 	case P_WORD: printf("Word"); break;
789 	case P_NWORD: printf("NotWord"); break;
790 	case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
791 	case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
792 	case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
793 	case P_ANY: printf("Any"); break;
794 	case P_CHAR: printf("Char(%c)", node->c); break;
795 	case P_CCLASS:
796 		printf("Class(");
797 		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
798 		printf(")");
799 		break;
800 	case P_NCCLASS:
801 		printf("NotClass(");
802 		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
803 		printf(")");
804 		break;
805 	case P_REF: printf("Ref(%d)", node->n); break;
806 	}
807 }
808 
dumpprog(Reprog * prog)809 static void dumpprog(Reprog *prog)
810 {
811 	Reinst *inst;
812 	int i;
813 	for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
814 		printf("% 5d: ", i);
815 		switch (inst->opcode) {
816 		case I_END: puts("end"); break;
817 		case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
818 		case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
819 		case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
820 		case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
821 		case I_ANY: puts("any"); break;
822 		case I_ANYNL: puts("anynl"); break;
823 		case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
824 		case I_CCLASS: puts("cclass"); break;
825 		case I_NCCLASS: puts("ncclass"); break;
826 		case I_REF: printf("ref %d\n", inst->n); break;
827 		case I_BOL: puts("bol"); break;
828 		case I_EOL: puts("eol"); break;
829 		case I_WORD: puts("word"); break;
830 		case I_NWORD: puts("nword"); break;
831 		case I_LPAR: printf("lpar %d\n", inst->n); break;
832 		case I_RPAR: printf("rpar %d\n", inst->n); break;
833 		}
834 	}
835 }
836 #endif
837 
re_regcomp(const char * pattern,int cflags,const char ** errorp)838 Reprog *re_regcomp(const char *pattern, int cflags, const char **errorp)
839 {
840         Reprog *prog;
841         Restate *g;
842 	Renode *node;
843 	Reinst *split, *jump;
844 	int i;
845         unsigned int ncount;
846         size_t pattern_len = strlen(pattern);
847 
848         if (pattern_len > 10000) {
849                 /* Avoid stack exhaustion in recursive parseatom() et.al. */
850                 if (errorp)
851                         *errorp = "regexp pattern too long (max 10000)";
852                 return NULL;
853         }
854 
855 	prog = rd_calloc(1, sizeof (Reprog));
856         g = &prog->g;
857         g->prog = prog;
858 	g->pstart = g->pend = rd_malloc(sizeof (Renode) * pattern_len * 2);
859 
860 	if (setjmp(g->kaboom)) {
861 		if (errorp) *errorp = g->error;
862 		rd_free(g->pstart);
863 		rd_free(prog);
864 		return NULL;
865 	}
866 
867 	g->source = pattern;
868 	g->ncclass = 0;
869 	g->nsub = 1;
870 	for (i = 0; i < MAXSUB; ++i)
871 		g->sub[i] = 0;
872 
873 	g->prog->flags = cflags;
874 
875 	next(g);
876 	node = parsealt(g);
877 	if (g->lookahead == ')')
878 		die(g, "unmatched ')'");
879 	if (g->lookahead != 0)
880 		die(g, "syntax error");
881 
882 	g->prog->nsub = g->nsub;
883         ncount = count(node);
884         if (ncount > 10000)
885                 die(g, "regexp graph too large");
886         g->prog->start = g->prog->end = rd_malloc((ncount + 6) * sizeof (Reinst));
887 
888 	split = emit(g->prog, I_SPLIT);
889 	split->x = split + 3;
890 	split->y = split + 1;
891 	emit(g->prog, I_ANYNL);
892 	jump = emit(g->prog, I_JUMP);
893 	jump->x = split;
894 	emit(g->prog, I_LPAR);
895 	compile(g->prog, node);
896 	emit(g->prog, I_RPAR);
897 	emit(g->prog, I_END);
898 
899 #ifdef TEST
900 	dumpnode(node);
901 	putchar('\n');
902 	dumpprog(g->prog);
903 #endif
904 
905 	rd_free(g->pstart);
906 
907 	if (errorp) *errorp = NULL;
908 	return g->prog;
909 }
910 
re_regfree(Reprog * prog)911 void re_regfree(Reprog *prog)
912 {
913 	if (prog) {
914 		rd_free(prog->start);
915 		rd_free(prog);
916 	}
917 }
918 
919 /* Match */
920 
isnewline(int c)921 static int isnewline(int c)
922 {
923 	return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
924 }
925 
iswordchar(int c)926 static int iswordchar(int c)
927 {
928 	return c == '_' ||
929 		(c >= 'a' && c <= 'z') ||
930 		(c >= 'A' && c <= 'Z') ||
931 		(c >= '0' && c <= '9');
932 }
933 
incclass(Reclass * cc,Rune c)934 static int incclass(Reclass *cc, Rune c)
935 {
936 	Rune *p;
937 	for (p = cc->spans; p < cc->end; p += 2)
938 		if (p[0] <= c && c <= p[1])
939 			return 1;
940 	return 0;
941 }
942 
incclasscanon(Reclass * cc,Rune c)943 static int incclasscanon(Reclass *cc, Rune c)
944 {
945 	Rune *p, r;
946 	for (p = cc->spans; p < cc->end; p += 2)
947 		for (r = p[0]; r <= p[1]; ++r)
948 			if (c == canon(r))
949 				return 1;
950 	return 0;
951 }
952 
strncmpcanon(const char * a,const char * b,unsigned int n)953 static int strncmpcanon(const char *a, const char *b, unsigned int n)
954 {
955 	Rune ra, rb;
956 	int c;
957 	while (n--) {
958 		if (!*a) return -1;
959 		if (!*b) return 1;
960 		a += chartorune(&ra, a);
961 		b += chartorune(&rb, b);
962 		c = canon(ra) - canon(rb);
963 		if (c)
964 			return c;
965 	}
966 	return 0;
967 }
968 
969 struct Rethread {
970 	Reinst *pc;
971 	const char *sp;
972 	Resub sub;
973 };
974 
spawn(Rethread * t,Reinst * pc,const char * sp,Resub * sub)975 static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
976 {
977 	t->pc = pc;
978 	t->sp = sp;
979 	memcpy(&t->sub, sub, sizeof t->sub);
980 }
981 
match(Reinst * pc,const char * sp,const char * bol,int flags,Resub * out)982 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
983 {
984 	Rethread ready[MAXTHREAD];
985 	Resub scratch;
986 	Resub sub;
987 	Rune c;
988 	unsigned int nready;
989 	int i;
990 
991 	/* queue initial thread */
992 	spawn(ready + 0, pc, sp, out);
993 	nready = 1;
994 
995 	/* run threads in stack order */
996 	while (nready > 0) {
997 		--nready;
998 		pc = ready[nready].pc;
999 		sp = ready[nready].sp;
1000 		memcpy(&sub, &ready[nready].sub, sizeof sub);
1001 		for (;;) {
1002 			switch (pc->opcode) {
1003 			case I_END:
1004 				for (i = 0; i < MAXSUB; ++i) {
1005 					out->sub[i].sp = sub.sub[i].sp;
1006 					out->sub[i].ep = sub.sub[i].ep;
1007 				}
1008 				return 1;
1009 			case I_JUMP:
1010 				pc = pc->x;
1011 				continue;
1012 			case I_SPLIT:
1013 				if (nready >= MAXTHREAD) {
1014 					fprintf(stderr, "regexec: backtrack overflow!\n");
1015 					return 0;
1016 				}
1017 				spawn(&ready[nready++], pc->y, sp, &sub);
1018 				pc = pc->x;
1019 				continue;
1020 
1021 			case I_PLA:
1022 				if (!match(pc->x, sp, bol, flags, &sub))
1023 					goto dead;
1024 				pc = pc->y;
1025 				continue;
1026 			case I_NLA:
1027 				memcpy(&scratch, &sub, sizeof scratch);
1028 				if (match(pc->x, sp, bol, flags, &scratch))
1029 					goto dead;
1030 				pc = pc->y;
1031 				continue;
1032 
1033 			case I_ANYNL:
1034 				sp += chartorune(&c, sp);
1035 				if (c == 0)
1036 					goto dead;
1037 				break;
1038 			case I_ANY:
1039 				sp += chartorune(&c, sp);
1040 				if (c == 0)
1041 					goto dead;
1042 				if (isnewline(c))
1043 					goto dead;
1044 				break;
1045 			case I_CHAR:
1046 				sp += chartorune(&c, sp);
1047 				if (c == 0)
1048 					goto dead;
1049 				if (flags & REG_ICASE)
1050 					c = canon(c);
1051 				if (c != pc->c)
1052 					goto dead;
1053 				break;
1054 			case I_CCLASS:
1055 				sp += chartorune(&c, sp);
1056 				if (c == 0)
1057 					goto dead;
1058 				if (flags & REG_ICASE) {
1059 					if (!incclasscanon(pc->cc, canon(c)))
1060 						goto dead;
1061 				} else {
1062 					if (!incclass(pc->cc, c))
1063 						goto dead;
1064 				}
1065 				break;
1066 			case I_NCCLASS:
1067 				sp += chartorune(&c, sp);
1068 				if (c == 0)
1069 					goto dead;
1070 				if (flags & REG_ICASE) {
1071 					if (incclasscanon(pc->cc, canon(c)))
1072 						goto dead;
1073 				} else {
1074 					if (incclass(pc->cc, c))
1075 						goto dead;
1076 				}
1077 				break;
1078 			case I_REF:
1079 				i = (int)(sub.sub[pc->n].ep - sub.sub[pc->n].sp);
1080 				if (flags & REG_ICASE) {
1081 					if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
1082 						goto dead;
1083 				} else {
1084 					if (strncmp(sp, sub.sub[pc->n].sp, i))
1085 						goto dead;
1086 				}
1087 				if (i > 0)
1088 					sp += i;
1089 				break;
1090 
1091 			case I_BOL:
1092 				if (sp == bol && !(flags & REG_NOTBOL))
1093 					break;
1094 				if (flags & REG_NEWLINE)
1095 					if (sp > bol && isnewline(sp[-1]))
1096 						break;
1097 				goto dead;
1098 			case I_EOL:
1099 				if (*sp == 0)
1100 					break;
1101 				if (flags & REG_NEWLINE)
1102 					if (isnewline(*sp))
1103 						break;
1104 				goto dead;
1105 			case I_WORD:
1106 				i = sp > bol && iswordchar(sp[-1]);
1107 				i ^= iswordchar(sp[0]);
1108 				if (i)
1109 					break;
1110 				goto dead;
1111 			case I_NWORD:
1112 				i = sp > bol && iswordchar(sp[-1]);
1113 				i ^= iswordchar(sp[0]);
1114 				if (!i)
1115 					break;
1116 				goto dead;
1117 
1118 			case I_LPAR:
1119 				sub.sub[pc->n].sp = sp;
1120 				break;
1121 			case I_RPAR:
1122 				sub.sub[pc->n].ep = sp;
1123 				break;
1124 			default:
1125 				goto dead;
1126 			}
1127 			pc = pc + 1;
1128 		}
1129 dead: ;
1130 	}
1131 	return 0;
1132 }
1133 
re_regexec(Reprog * prog,const char * sp,Resub * sub,int eflags)1134 int re_regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1135 {
1136 	Resub scratch;
1137 	int i;
1138 
1139 	if (!sub)
1140 		sub = &scratch;
1141 
1142 	sub->nsub = prog->nsub;
1143 	for (i = 0; i < MAXSUB; ++i)
1144 		sub->sub[i].sp = sub->sub[i].ep = NULL;
1145 
1146 	return !match(prog->start, sp, sp, prog->flags | eflags, sub);
1147 }
1148 
1149 #ifdef TEST
main(int argc,char ** argv)1150 int main(int argc, char **argv)
1151 {
1152 	const char *error;
1153 	const char *s;
1154 	Reprog *p;
1155 	Resub m;
1156 	unsigned int i;
1157 
1158 	if (argc > 1) {
1159 		p = regcomp(argv[1], 0, &error);
1160 		if (!p) {
1161 			fprintf(stderr, "regcomp: %s\n", error);
1162 			return 1;
1163 		}
1164 
1165 		if (argc > 2) {
1166 			s = argv[2];
1167 			printf("nsub = %d\n", p->nsub);
1168 			if (!regexec(p, s, &m, 0)) {
1169 				for (i = 0; i < m.nsub; ++i) {
1170 					int n = m.sub[i].ep - m.sub[i].sp;
1171 					if (n > 0)
1172 						printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1173 					else
1174 						printf("match %d: n=0 ''\n", i);
1175 				}
1176 			} else {
1177 				printf("no match\n");
1178 			}
1179 		}
1180 	}
1181 
1182 	return 0;
1183 }
1184 #endif
1185