1 /**
2 * Copyright: public domain
3 *
4 * From https://github.com/ccxvii/minilibs sha 875c33568b5a4aa4fb3dd0c52ea98f7f0e5ca684:
5 *
6 * These libraries are in the public domain (or the equivalent where that is not possible).
7 * You can do anything you want with them. You have no legal obligation to do anything else,
8 * although I appreciate attribution.
9 */
10
11 #include "rd.h"
12
13 #include <stdlib.h>
14 #include <string.h>
15 #include <setjmp.h>
16 #include <stdio.h>
17
18 #include "regexp.h"
19
20 #define nelem(a) (sizeof (a) / sizeof (a)[0])
21
22 typedef unsigned int Rune;
23
isalpharune(Rune c)24 static int isalpharune(Rune c)
25 {
26 /* TODO: Add unicode support */
27 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
28 }
29
toupperrune(Rune c)30 static Rune toupperrune(Rune c)
31 {
32 /* TODO: Add unicode support */
33 if (c >= 'a' && c <= 'z')
34 return c - 'a' + 'A';
35 return c;
36 }
37
chartorune(Rune * r,const char * s)38 static int chartorune(Rune *r, const char *s)
39 {
40 /* TODO: Add UTF-8 decoding */
41 *r = *s;
42 return 1;
43 }
44
45 #define REPINF 255
46 #define MAXTHREAD 1000
47 #define MAXSUB REG_MAXSUB
48
49 typedef struct Reclass Reclass;
50 typedef struct Renode Renode;
51 typedef struct Reinst Reinst;
52 typedef struct Rethread Rethread;
53 typedef struct Restate Restate;
54
55 struct Reclass {
56 Rune *end;
57 Rune spans[64];
58 };
59
60 struct Restate {
61 Reprog *prog;
62 Renode *pstart, *pend;
63
64 const char *source;
65 unsigned int ncclass;
66 unsigned int nsub;
67 Renode *sub[MAXSUB];
68
69 int lookahead;
70 Rune yychar;
71 Reclass *yycc;
72 int yymin, yymax;
73
74 const char *error;
75 jmp_buf kaboom;
76 };
77
78 struct Reprog {
79 Reinst *start, *end;
80 int flags;
81 unsigned int nsub;
82 Reclass cclass[16];
83 Restate g; /**< Upstream has this as a global variable */
84 };
85
die(Restate * g,const char * message)86 static void die(Restate *g, const char *message)
87 {
88 g->error = message;
89 longjmp(g->kaboom, 1);
90 }
91
canon(Rune c)92 static Rune canon(Rune c)
93 {
94 Rune u = toupperrune(c);
95 if (c >= 128 && u < 128)
96 return c;
97 return u;
98 }
99
100 /* Scan */
101
102 enum {
103 L_CHAR = 256,
104 L_CCLASS, /* character class */
105 L_NCCLASS, /* negative character class */
106 L_NC, /* "(?:" no capture */
107 L_PLA, /* "(?=" positive lookahead */
108 L_NLA, /* "(?!" negative lookahead */
109 L_WORD, /* "\b" word boundary */
110 L_NWORD, /* "\B" non-word boundary */
111 L_REF, /* "\1" back-reference */
112 L_COUNT /* {M,N} */
113 };
114
hex(Restate * g,int c)115 static int hex(Restate *g, int c)
116 {
117 if (c >= '0' && c <= '9') return c - '0';
118 if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
119 if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
120 die(g, "invalid escape sequence");
121 return 0;
122 }
123
dec(Restate * g,int c)124 static int dec(Restate *g, int c)
125 {
126 if (c >= '0' && c <= '9') return c - '0';
127 die(g, "invalid quantifier");
128 return 0;
129 }
130
131 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
132
nextrune(Restate * g)133 static int nextrune(Restate *g)
134 {
135 g->source += chartorune(&g->yychar, g->source);
136 if (g->yychar == '\\') {
137 g->source += chartorune(&g->yychar, g->source);
138 switch (g->yychar) {
139 case 0: die(g, "unterminated escape sequence");
140 case 'f': g->yychar = '\f'; return 0;
141 case 'n': g->yychar = '\n'; return 0;
142 case 'r': g->yychar = '\r'; return 0;
143 case 't': g->yychar = '\t'; return 0;
144 case 'v': g->yychar = '\v'; return 0;
145 case 'c':
146 g->yychar = (*g->source++) & 31;
147 return 0;
148 case 'x':
149 g->yychar = hex(g, *g->source++) << 4;
150 g->yychar += hex(g, *g->source++);
151 if (g->yychar == 0) {
152 g->yychar = '0';
153 return 1;
154 }
155 return 0;
156 case 'u':
157 g->yychar = hex(g, *g->source++) << 12;
158 g->yychar += hex(g, *g->source++) << 8;
159 g->yychar += hex(g, *g->source++) << 4;
160 g->yychar += hex(g, *g->source++);
161 if (g->yychar == 0) {
162 g->yychar = '0';
163 return 1;
164 }
165 return 0;
166 }
167 if (strchr(ESCAPES, g->yychar))
168 return 1;
169 if (isalpharune(g->yychar) || g->yychar == '_') /* check identity escape */
170 die(g, "invalid escape character");
171 return 0;
172 }
173 return 0;
174 }
175
lexcount(Restate * g)176 static int lexcount(Restate *g)
177 {
178 g->yychar = *g->source++;
179
180 g->yymin = dec(g, g->yychar);
181 g->yychar = *g->source++;
182 while (g->yychar != ',' && g->yychar != '}') {
183 g->yymin = g->yymin * 10 + dec(g, g->yychar);
184 g->yychar = *g->source++;
185 }
186 if (g->yymin >= REPINF)
187 die(g, "numeric overflow");
188
189 if (g->yychar == ',') {
190 g->yychar = *g->source++;
191 if (g->yychar == '}') {
192 g->yymax = REPINF;
193 } else {
194 g->yymax = dec(g, g->yychar);
195 g->yychar = *g->source++;
196 while (g->yychar != '}') {
197 g->yymax = g->yymax * 10 + dec(g, g->yychar);
198 g->yychar = *g->source++;
199 }
200 if (g->yymax >= REPINF)
201 die(g, "numeric overflow");
202 }
203 } else {
204 g->yymax = g->yymin;
205 }
206
207 return L_COUNT;
208 }
209
newcclass(Restate * g)210 static void newcclass(Restate *g)
211 {
212 if (g->ncclass >= nelem(g->prog->cclass))
213 die(g, "too many character classes");
214 g->yycc = g->prog->cclass + g->ncclass++;
215 g->yycc->end = g->yycc->spans;
216 }
217
addrange(Restate * g,Rune a,Rune b)218 static void addrange(Restate *g, Rune a, Rune b)
219 {
220 if (a > b)
221 die(g, "invalid character class range");
222 if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
223 die(g, "too many character class ranges");
224 *g->yycc->end++ = a;
225 *g->yycc->end++ = b;
226 }
227
addranges_d(Restate * g)228 static void addranges_d(Restate *g)
229 {
230 addrange(g, '0', '9');
231 }
232
addranges_D(Restate * g)233 static void addranges_D(Restate *g)
234 {
235 addrange(g, 0, '0'-1);
236 addrange(g, '9'+1, 0xFFFF);
237 }
238
addranges_s(Restate * g)239 static void addranges_s(Restate *g)
240 {
241 addrange(g, 0x9, 0x9);
242 addrange(g, 0xA, 0xD);
243 addrange(g, 0x20, 0x20);
244 addrange(g, 0xA0, 0xA0);
245 addrange(g, 0x2028, 0x2029);
246 addrange(g, 0xFEFF, 0xFEFF);
247 }
248
addranges_S(Restate * g)249 static void addranges_S(Restate *g)
250 {
251 addrange(g, 0, 0x9-1);
252 addrange(g, 0x9+1, 0xA-1);
253 addrange(g, 0xD+1, 0x20-1);
254 addrange(g, 0x20+1, 0xA0-1);
255 addrange(g, 0xA0+1, 0x2028-1);
256 addrange(g, 0x2029+1, 0xFEFF-1);
257 addrange(g, 0xFEFF+1, 0xFFFF);
258 }
259
addranges_w(Restate * g)260 static void addranges_w(Restate *g)
261 {
262 addrange(g, '0', '9');
263 addrange(g, 'A', 'Z');
264 addrange(g, '_', '_');
265 addrange(g, 'a', 'z');
266 }
267
addranges_W(Restate * g)268 static void addranges_W(Restate *g)
269 {
270 addrange(g, 0, '0'-1);
271 addrange(g, '9'+1, 'A'-1);
272 addrange(g, 'Z'+1, '_'-1);
273 addrange(g, '_'+1, 'a'-1);
274 addrange(g, 'z'+1, 0xFFFF);
275 }
276
lexclass(Restate * g)277 static int lexclass(Restate *g)
278 {
279 int type = L_CCLASS;
280 int quoted, havesave, havedash;
281 Rune save = 0;
282
283 newcclass(g);
284
285 quoted = nextrune(g);
286 if (!quoted && g->yychar == '^') {
287 type = L_NCCLASS;
288 quoted = nextrune(g);
289 }
290
291 havesave = havedash = 0;
292 for (;;) {
293 if (g->yychar == 0)
294 die(g, "unterminated character class");
295 if (!quoted && g->yychar == ']')
296 break;
297
298 if (!quoted && g->yychar == '-') {
299 if (havesave) {
300 if (havedash) {
301 addrange(g, save, '-');
302 havesave = havedash = 0;
303 } else {
304 havedash = 1;
305 }
306 } else {
307 save = '-';
308 havesave = 1;
309 }
310 } else if (quoted && strchr("DSWdsw", g->yychar)) {
311 if (havesave) {
312 addrange(g, save, save);
313 if (havedash)
314 addrange(g, '-', '-');
315 }
316 switch (g->yychar) {
317 case 'd': addranges_d(g); break;
318 case 's': addranges_s(g); break;
319 case 'w': addranges_w(g); break;
320 case 'D': addranges_D(g); break;
321 case 'S': addranges_S(g); break;
322 case 'W': addranges_W(g); break;
323 }
324 havesave = havedash = 0;
325 } else {
326 if (quoted) {
327 if (g->yychar == 'b')
328 g->yychar = '\b';
329 else if (g->yychar == '0')
330 g->yychar = 0;
331 /* else identity escape */
332 }
333 if (havesave) {
334 if (havedash) {
335 addrange(g, save, g->yychar);
336 havesave = havedash = 0;
337 } else {
338 addrange(g, save, save);
339 save = g->yychar;
340 }
341 } else {
342 save = g->yychar;
343 havesave = 1;
344 }
345 }
346
347 quoted = nextrune(g);
348 }
349
350 if (havesave) {
351 addrange(g, save, save);
352 if (havedash)
353 addrange(g, '-', '-');
354 }
355
356 return type;
357 }
358
lex(Restate * g)359 static int lex(Restate *g)
360 {
361 int quoted = nextrune(g);
362 if (quoted) {
363 switch (g->yychar) {
364 case 'b': return L_WORD;
365 case 'B': return L_NWORD;
366 case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
367 case 's': newcclass(g); addranges_s(g); return L_CCLASS;
368 case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
369 case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
370 case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
371 case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
372 case '0': g->yychar = 0; return L_CHAR;
373 }
374 if (g->yychar >= '0' && g->yychar <= '9') {
375 g->yychar -= '0';
376 if (*g->source >= '0' && *g->source <= '9')
377 g->yychar = g->yychar * 10 + *g->source++ - '0';
378 return L_REF;
379 }
380 return L_CHAR;
381 }
382
383 switch (g->yychar) {
384 case 0:
385 case '$': case ')': case '*': case '+':
386 case '.': case '?': case '^': case '|':
387 return g->yychar;
388 }
389
390 if (g->yychar == '{')
391 return lexcount(g);
392 if (g->yychar == '[')
393 return lexclass(g);
394 if (g->yychar == '(') {
395 if (g->source[0] == '?') {
396 if (g->source[1] == ':') {
397 g->source += 2;
398 return L_NC;
399 }
400 if (g->source[1] == '=') {
401 g->source += 2;
402 return L_PLA;
403 }
404 if (g->source[1] == '!') {
405 g->source += 2;
406 return L_NLA;
407 }
408 }
409 return '(';
410 }
411
412 return L_CHAR;
413 }
414
415 /* Parse */
416
417 enum {
418 P_CAT, P_ALT, P_REP,
419 P_BOL, P_EOL, P_WORD, P_NWORD,
420 P_PAR, P_PLA, P_NLA,
421 P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
422 P_REF
423 };
424
425 struct Renode {
426 unsigned char type;
427 unsigned char ng, m, n;
428 Rune c;
429 Reclass *cc;
430 Renode *x;
431 Renode *y;
432 };
433
newnode(Restate * g,int type)434 static Renode *newnode(Restate *g, int type)
435 {
436 Renode *node = g->pend++;
437 node->type = type;
438 node->cc = NULL;
439 node->c = 0;
440 node->ng = 0;
441 node->m = 0;
442 node->n = 0;
443 node->x = node->y = NULL;
444 return node;
445 }
446
empty(Renode * node)447 static int empty(Renode *node)
448 {
449 if (!node) return 1;
450 switch (node->type) {
451 default: return 1;
452 case P_CAT: return empty(node->x) && empty(node->y);
453 case P_ALT: return empty(node->x) || empty(node->y);
454 case P_REP: return empty(node->x) || node->m == 0;
455 case P_PAR: return empty(node->x);
456 case P_REF: return empty(node->x);
457 case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
458 }
459 }
460
newrep(Restate * g,Renode * atom,int ng,int min,int max)461 static Renode *newrep(Restate *g, Renode *atom, int ng, int min, int max)
462 {
463 Renode *rep = newnode(g, P_REP);
464 if (max == REPINF && empty(atom))
465 die(g, "infinite loop matching the empty string");
466 rep->ng = ng;
467 rep->m = min;
468 rep->n = max;
469 rep->x = atom;
470 return rep;
471 }
472
next(Restate * g)473 static void next(Restate *g)
474 {
475 g->lookahead = lex(g);
476 }
477
re_accept(Restate * g,int t)478 static int re_accept(Restate *g, int t)
479 {
480 if (g->lookahead == t) {
481 next(g);
482 return 1;
483 }
484 return 0;
485 }
486
487 static Renode *parsealt(Restate *g);
488
parseatom(Restate * g)489 static Renode *parseatom(Restate *g)
490 {
491 Renode *atom;
492 if (g->lookahead == L_CHAR) {
493 atom = newnode(g, P_CHAR);
494 atom->c = g->yychar;
495 next(g);
496 return atom;
497 }
498 if (g->lookahead == L_CCLASS) {
499 atom = newnode(g, P_CCLASS);
500 atom->cc = g->yycc;
501 next(g);
502 return atom;
503 }
504 if (g->lookahead == L_NCCLASS) {
505 atom = newnode(g, P_NCCLASS);
506 atom->cc = g->yycc;
507 next(g);
508 return atom;
509 }
510 if (g->lookahead == L_REF) {
511 atom = newnode(g, P_REF);
512 if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
513 die(g, "invalid back-reference");
514 atom->n = g->yychar;
515 atom->x = g->sub[g->yychar];
516 next(g);
517 return atom;
518 }
519 if (re_accept(g, '.'))
520 return newnode(g, P_ANY);
521 if (re_accept(g, '(')) {
522 atom = newnode(g, P_PAR);
523 if (g->nsub == MAXSUB)
524 die(g, "too many captures");
525 atom->n = g->nsub++;
526 atom->x = parsealt(g);
527 g->sub[atom->n] = atom;
528 if (!re_accept(g, ')'))
529 die(g, "unmatched '('");
530 return atom;
531 }
532 if (re_accept(g, L_NC)) {
533 atom = parsealt(g);
534 if (!re_accept(g, ')'))
535 die(g, "unmatched '('");
536 return atom;
537 }
538 if (re_accept(g, L_PLA)) {
539 atom = newnode(g, P_PLA);
540 atom->x = parsealt(g);
541 if (!re_accept(g, ')'))
542 die(g, "unmatched '('");
543 return atom;
544 }
545 if (re_accept(g, L_NLA)) {
546 atom = newnode(g, P_NLA);
547 atom->x = parsealt(g);
548 if (!re_accept(g, ')'))
549 die(g, "unmatched '('");
550 return atom;
551 }
552 die(g, "syntax error");
553 return NULL;
554 }
555
parserep(Restate * g)556 static Renode *parserep(Restate *g)
557 {
558 Renode *atom;
559
560 if (re_accept(g, '^')) return newnode(g, P_BOL);
561 if (re_accept(g, '$')) return newnode(g, P_EOL);
562 if (re_accept(g, L_WORD)) return newnode(g, P_WORD);
563 if (re_accept(g, L_NWORD)) return newnode(g, P_NWORD);
564
565 atom = parseatom(g);
566 if (g->lookahead == L_COUNT) {
567 int min = g->yymin, max = g->yymax;
568 next(g);
569 if (max < min)
570 die(g, "invalid quantifier");
571 return newrep(g, atom, re_accept(g, '?'), min, max);
572 }
573 if (re_accept(g, '*')) return newrep(g, atom, re_accept(g, '?'), 0, REPINF);
574 if (re_accept(g, '+')) return newrep(g, atom, re_accept(g, '?'), 1, REPINF);
575 if (re_accept(g, '?')) return newrep(g, atom, re_accept(g, '?'), 0, 1);
576 return atom;
577 }
578
parsecat(Restate * g)579 static Renode *parsecat(Restate *g)
580 {
581 Renode *cat, *x;
582 if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
583 cat = parserep(g);
584 while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
585 x = cat;
586 cat = newnode(g, P_CAT);
587 cat->x = x;
588 cat->y = parserep(g);
589 }
590 return cat;
591 }
592 return NULL;
593 }
594
parsealt(Restate * g)595 static Renode *parsealt(Restate *g)
596 {
597 Renode *alt, *x;
598 alt = parsecat(g);
599 while (re_accept(g, '|')) {
600 x = alt;
601 alt = newnode(g, P_ALT);
602 alt->x = x;
603 alt->y = parsecat(g);
604 }
605 return alt;
606 }
607
608 /* Compile */
609
610 enum {
611 I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
612 I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
613 I_BOL, I_EOL, I_WORD, I_NWORD,
614 I_LPAR, I_RPAR
615 };
616
617 struct Reinst {
618 unsigned char opcode;
619 unsigned char n;
620 Rune c;
621 Reclass *cc;
622 Reinst *x;
623 Reinst *y;
624 };
625
count(Renode * node)626 static unsigned int count(Renode *node)
627 {
628 unsigned int min, max;
629 if (!node) return 0;
630 switch (node->type) {
631 default: return 1;
632 case P_CAT: return count(node->x) + count(node->y);
633 case P_ALT: return count(node->x) + count(node->y) + 2;
634 case P_REP:
635 min = node->m;
636 max = node->n;
637 if (min == max) return count(node->x) * min;
638 if (max < REPINF) return count(node->x) * max + (max - min);
639 return count(node->x) * (min + 1) + 2;
640 case P_PAR: return count(node->x) + 2;
641 case P_PLA: return count(node->x) + 2;
642 case P_NLA: return count(node->x) + 2;
643 }
644 }
645
emit(Reprog * prog,int opcode)646 static Reinst *emit(Reprog *prog, int opcode)
647 {
648 Reinst *inst = prog->end++;
649 inst->opcode = opcode;
650 inst->n = 0;
651 inst->c = 0;
652 inst->cc = NULL;
653 inst->x = inst->y = NULL;
654 return inst;
655 }
656
compile(Reprog * prog,Renode * node)657 static void compile(Reprog *prog, Renode *node)
658 {
659 Reinst *inst, *split, *jump;
660 unsigned int i;
661
662 if (!node)
663 return;
664
665 switch (node->type) {
666 case P_CAT:
667 compile(prog, node->x);
668 compile(prog, node->y);
669 break;
670
671 case P_ALT:
672 split = emit(prog, I_SPLIT);
673 compile(prog, node->x);
674 jump = emit(prog, I_JUMP);
675 compile(prog, node->y);
676 split->x = split + 1;
677 split->y = jump + 1;
678 jump->x = prog->end;
679 break;
680
681 case P_REP:
682 for (i = 0; i < node->m; ++i) {
683 inst = prog->end;
684 compile(prog, node->x);
685 }
686 if (node->m == node->n)
687 break;
688 if (node->n < REPINF) {
689 for (i = node->m; i < node->n; ++i) {
690 split = emit(prog, I_SPLIT);
691 compile(prog, node->x);
692 if (node->ng) {
693 split->y = split + 1;
694 split->x = prog->end;
695 } else {
696 split->x = split + 1;
697 split->y = prog->end;
698 }
699 }
700 } else if (node->m == 0) {
701 split = emit(prog, I_SPLIT);
702 compile(prog, node->x);
703 jump = emit(prog, I_JUMP);
704 if (node->ng) {
705 split->y = split + 1;
706 split->x = prog->end;
707 } else {
708 split->x = split + 1;
709 split->y = prog->end;
710 }
711 jump->x = split;
712 } else {
713 split = emit(prog, I_SPLIT);
714 if (node->ng) {
715 split->y = inst;
716 split->x = prog->end;
717 } else {
718 split->x = inst;
719 split->y = prog->end;
720 }
721 }
722 break;
723
724 case P_BOL: emit(prog, I_BOL); break;
725 case P_EOL: emit(prog, I_EOL); break;
726 case P_WORD: emit(prog, I_WORD); break;
727 case P_NWORD: emit(prog, I_NWORD); break;
728
729 case P_PAR:
730 inst = emit(prog, I_LPAR);
731 inst->n = node->n;
732 compile(prog, node->x);
733 inst = emit(prog, I_RPAR);
734 inst->n = node->n;
735 break;
736 case P_PLA:
737 split = emit(prog, I_PLA);
738 compile(prog, node->x);
739 emit(prog, I_END);
740 split->x = split + 1;
741 split->y = prog->end;
742 break;
743 case P_NLA:
744 split = emit(prog, I_NLA);
745 compile(prog, node->x);
746 emit(prog, I_END);
747 split->x = split + 1;
748 split->y = prog->end;
749 break;
750
751 case P_ANY:
752 emit(prog, I_ANY);
753 break;
754 case P_CHAR:
755 inst = emit(prog, I_CHAR);
756 inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
757 break;
758 case P_CCLASS:
759 inst = emit(prog, I_CCLASS);
760 inst->cc = node->cc;
761 break;
762 case P_NCCLASS:
763 inst = emit(prog, I_NCCLASS);
764 inst->cc = node->cc;
765 break;
766 case P_REF:
767 inst = emit(prog, I_REF);
768 inst->n = node->n;
769 break;
770 }
771 }
772
773 #ifdef TEST
dumpnode(Renode * node)774 static void dumpnode(Renode *node)
775 {
776 Rune *p;
777 if (!node) { printf("Empty"); return; }
778 switch (node->type) {
779 case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
780 case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
781 case P_REP:
782 printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
783 dumpnode(node->x);
784 printf(")");
785 break;
786 case P_BOL: printf("Bol"); break;
787 case P_EOL: printf("Eol"); break;
788 case P_WORD: printf("Word"); break;
789 case P_NWORD: printf("NotWord"); break;
790 case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
791 case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
792 case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
793 case P_ANY: printf("Any"); break;
794 case P_CHAR: printf("Char(%c)", node->c); break;
795 case P_CCLASS:
796 printf("Class(");
797 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
798 printf(")");
799 break;
800 case P_NCCLASS:
801 printf("NotClass(");
802 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
803 printf(")");
804 break;
805 case P_REF: printf("Ref(%d)", node->n); break;
806 }
807 }
808
dumpprog(Reprog * prog)809 static void dumpprog(Reprog *prog)
810 {
811 Reinst *inst;
812 int i;
813 for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
814 printf("% 5d: ", i);
815 switch (inst->opcode) {
816 case I_END: puts("end"); break;
817 case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
818 case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
819 case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
820 case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
821 case I_ANY: puts("any"); break;
822 case I_ANYNL: puts("anynl"); break;
823 case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
824 case I_CCLASS: puts("cclass"); break;
825 case I_NCCLASS: puts("ncclass"); break;
826 case I_REF: printf("ref %d\n", inst->n); break;
827 case I_BOL: puts("bol"); break;
828 case I_EOL: puts("eol"); break;
829 case I_WORD: puts("word"); break;
830 case I_NWORD: puts("nword"); break;
831 case I_LPAR: printf("lpar %d\n", inst->n); break;
832 case I_RPAR: printf("rpar %d\n", inst->n); break;
833 }
834 }
835 }
836 #endif
837
re_regcomp(const char * pattern,int cflags,const char ** errorp)838 Reprog *re_regcomp(const char *pattern, int cflags, const char **errorp)
839 {
840 Reprog *prog;
841 Restate *g;
842 Renode *node;
843 Reinst *split, *jump;
844 int i;
845 unsigned int ncount;
846 size_t pattern_len = strlen(pattern);
847
848 if (pattern_len > 10000) {
849 /* Avoid stack exhaustion in recursive parseatom() et.al. */
850 if (errorp)
851 *errorp = "regexp pattern too long (max 10000)";
852 return NULL;
853 }
854
855 prog = rd_calloc(1, sizeof (Reprog));
856 g = &prog->g;
857 g->prog = prog;
858 g->pstart = g->pend = rd_malloc(sizeof (Renode) * pattern_len * 2);
859
860 if (setjmp(g->kaboom)) {
861 if (errorp) *errorp = g->error;
862 rd_free(g->pstart);
863 rd_free(prog);
864 return NULL;
865 }
866
867 g->source = pattern;
868 g->ncclass = 0;
869 g->nsub = 1;
870 for (i = 0; i < MAXSUB; ++i)
871 g->sub[i] = 0;
872
873 g->prog->flags = cflags;
874
875 next(g);
876 node = parsealt(g);
877 if (g->lookahead == ')')
878 die(g, "unmatched ')'");
879 if (g->lookahead != 0)
880 die(g, "syntax error");
881
882 g->prog->nsub = g->nsub;
883 ncount = count(node);
884 if (ncount > 10000)
885 die(g, "regexp graph too large");
886 g->prog->start = g->prog->end = rd_malloc((ncount + 6) * sizeof (Reinst));
887
888 split = emit(g->prog, I_SPLIT);
889 split->x = split + 3;
890 split->y = split + 1;
891 emit(g->prog, I_ANYNL);
892 jump = emit(g->prog, I_JUMP);
893 jump->x = split;
894 emit(g->prog, I_LPAR);
895 compile(g->prog, node);
896 emit(g->prog, I_RPAR);
897 emit(g->prog, I_END);
898
899 #ifdef TEST
900 dumpnode(node);
901 putchar('\n');
902 dumpprog(g->prog);
903 #endif
904
905 rd_free(g->pstart);
906
907 if (errorp) *errorp = NULL;
908 return g->prog;
909 }
910
re_regfree(Reprog * prog)911 void re_regfree(Reprog *prog)
912 {
913 if (prog) {
914 rd_free(prog->start);
915 rd_free(prog);
916 }
917 }
918
919 /* Match */
920
isnewline(int c)921 static int isnewline(int c)
922 {
923 return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
924 }
925
iswordchar(int c)926 static int iswordchar(int c)
927 {
928 return c == '_' ||
929 (c >= 'a' && c <= 'z') ||
930 (c >= 'A' && c <= 'Z') ||
931 (c >= '0' && c <= '9');
932 }
933
incclass(Reclass * cc,Rune c)934 static int incclass(Reclass *cc, Rune c)
935 {
936 Rune *p;
937 for (p = cc->spans; p < cc->end; p += 2)
938 if (p[0] <= c && c <= p[1])
939 return 1;
940 return 0;
941 }
942
incclasscanon(Reclass * cc,Rune c)943 static int incclasscanon(Reclass *cc, Rune c)
944 {
945 Rune *p, r;
946 for (p = cc->spans; p < cc->end; p += 2)
947 for (r = p[0]; r <= p[1]; ++r)
948 if (c == canon(r))
949 return 1;
950 return 0;
951 }
952
strncmpcanon(const char * a,const char * b,unsigned int n)953 static int strncmpcanon(const char *a, const char *b, unsigned int n)
954 {
955 Rune ra, rb;
956 int c;
957 while (n--) {
958 if (!*a) return -1;
959 if (!*b) return 1;
960 a += chartorune(&ra, a);
961 b += chartorune(&rb, b);
962 c = canon(ra) - canon(rb);
963 if (c)
964 return c;
965 }
966 return 0;
967 }
968
969 struct Rethread {
970 Reinst *pc;
971 const char *sp;
972 Resub sub;
973 };
974
spawn(Rethread * t,Reinst * pc,const char * sp,Resub * sub)975 static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
976 {
977 t->pc = pc;
978 t->sp = sp;
979 memcpy(&t->sub, sub, sizeof t->sub);
980 }
981
match(Reinst * pc,const char * sp,const char * bol,int flags,Resub * out)982 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
983 {
984 Rethread ready[MAXTHREAD];
985 Resub scratch;
986 Resub sub;
987 Rune c;
988 unsigned int nready;
989 int i;
990
991 /* queue initial thread */
992 spawn(ready + 0, pc, sp, out);
993 nready = 1;
994
995 /* run threads in stack order */
996 while (nready > 0) {
997 --nready;
998 pc = ready[nready].pc;
999 sp = ready[nready].sp;
1000 memcpy(&sub, &ready[nready].sub, sizeof sub);
1001 for (;;) {
1002 switch (pc->opcode) {
1003 case I_END:
1004 for (i = 0; i < MAXSUB; ++i) {
1005 out->sub[i].sp = sub.sub[i].sp;
1006 out->sub[i].ep = sub.sub[i].ep;
1007 }
1008 return 1;
1009 case I_JUMP:
1010 pc = pc->x;
1011 continue;
1012 case I_SPLIT:
1013 if (nready >= MAXTHREAD) {
1014 fprintf(stderr, "regexec: backtrack overflow!\n");
1015 return 0;
1016 }
1017 spawn(&ready[nready++], pc->y, sp, &sub);
1018 pc = pc->x;
1019 continue;
1020
1021 case I_PLA:
1022 if (!match(pc->x, sp, bol, flags, &sub))
1023 goto dead;
1024 pc = pc->y;
1025 continue;
1026 case I_NLA:
1027 memcpy(&scratch, &sub, sizeof scratch);
1028 if (match(pc->x, sp, bol, flags, &scratch))
1029 goto dead;
1030 pc = pc->y;
1031 continue;
1032
1033 case I_ANYNL:
1034 sp += chartorune(&c, sp);
1035 if (c == 0)
1036 goto dead;
1037 break;
1038 case I_ANY:
1039 sp += chartorune(&c, sp);
1040 if (c == 0)
1041 goto dead;
1042 if (isnewline(c))
1043 goto dead;
1044 break;
1045 case I_CHAR:
1046 sp += chartorune(&c, sp);
1047 if (c == 0)
1048 goto dead;
1049 if (flags & REG_ICASE)
1050 c = canon(c);
1051 if (c != pc->c)
1052 goto dead;
1053 break;
1054 case I_CCLASS:
1055 sp += chartorune(&c, sp);
1056 if (c == 0)
1057 goto dead;
1058 if (flags & REG_ICASE) {
1059 if (!incclasscanon(pc->cc, canon(c)))
1060 goto dead;
1061 } else {
1062 if (!incclass(pc->cc, c))
1063 goto dead;
1064 }
1065 break;
1066 case I_NCCLASS:
1067 sp += chartorune(&c, sp);
1068 if (c == 0)
1069 goto dead;
1070 if (flags & REG_ICASE) {
1071 if (incclasscanon(pc->cc, canon(c)))
1072 goto dead;
1073 } else {
1074 if (incclass(pc->cc, c))
1075 goto dead;
1076 }
1077 break;
1078 case I_REF:
1079 i = (int)(sub.sub[pc->n].ep - sub.sub[pc->n].sp);
1080 if (flags & REG_ICASE) {
1081 if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
1082 goto dead;
1083 } else {
1084 if (strncmp(sp, sub.sub[pc->n].sp, i))
1085 goto dead;
1086 }
1087 if (i > 0)
1088 sp += i;
1089 break;
1090
1091 case I_BOL:
1092 if (sp == bol && !(flags & REG_NOTBOL))
1093 break;
1094 if (flags & REG_NEWLINE)
1095 if (sp > bol && isnewline(sp[-1]))
1096 break;
1097 goto dead;
1098 case I_EOL:
1099 if (*sp == 0)
1100 break;
1101 if (flags & REG_NEWLINE)
1102 if (isnewline(*sp))
1103 break;
1104 goto dead;
1105 case I_WORD:
1106 i = sp > bol && iswordchar(sp[-1]);
1107 i ^= iswordchar(sp[0]);
1108 if (i)
1109 break;
1110 goto dead;
1111 case I_NWORD:
1112 i = sp > bol && iswordchar(sp[-1]);
1113 i ^= iswordchar(sp[0]);
1114 if (!i)
1115 break;
1116 goto dead;
1117
1118 case I_LPAR:
1119 sub.sub[pc->n].sp = sp;
1120 break;
1121 case I_RPAR:
1122 sub.sub[pc->n].ep = sp;
1123 break;
1124 default:
1125 goto dead;
1126 }
1127 pc = pc + 1;
1128 }
1129 dead: ;
1130 }
1131 return 0;
1132 }
1133
re_regexec(Reprog * prog,const char * sp,Resub * sub,int eflags)1134 int re_regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1135 {
1136 Resub scratch;
1137 int i;
1138
1139 if (!sub)
1140 sub = &scratch;
1141
1142 sub->nsub = prog->nsub;
1143 for (i = 0; i < MAXSUB; ++i)
1144 sub->sub[i].sp = sub->sub[i].ep = NULL;
1145
1146 return !match(prog->start, sp, sp, prog->flags | eflags, sub);
1147 }
1148
1149 #ifdef TEST
main(int argc,char ** argv)1150 int main(int argc, char **argv)
1151 {
1152 const char *error;
1153 const char *s;
1154 Reprog *p;
1155 Resub m;
1156 unsigned int i;
1157
1158 if (argc > 1) {
1159 p = regcomp(argv[1], 0, &error);
1160 if (!p) {
1161 fprintf(stderr, "regcomp: %s\n", error);
1162 return 1;
1163 }
1164
1165 if (argc > 2) {
1166 s = argv[2];
1167 printf("nsub = %d\n", p->nsub);
1168 if (!regexec(p, s, &m, 0)) {
1169 for (i = 0; i < m.nsub; ++i) {
1170 int n = m.sub[i].ep - m.sub[i].sp;
1171 if (n > 0)
1172 printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1173 else
1174 printf("match %d: n=0 ''\n", i);
1175 }
1176 } else {
1177 printf("no match\n");
1178 }
1179 }
1180 }
1181
1182 return 0;
1183 }
1184 #endif
1185