1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <setjmp.h>
5 #include <limits.h>
6
7 #include "regexp.h"
8 #include "utf.h"
9
10 #define emit regemit
11 #define next regnext
12 #define accept regaccept
13
14 #define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15
16 #define REPINF 255
17 #ifndef REG_MAXPROG
18 #define REG_MAXPROG (32 << 10)
19 #endif
20 #ifndef REG_MAXREC
21 #define REG_MAXREC 1024
22 #endif
23 #ifndef REG_MAXSPAN
24 #define REG_MAXSPAN 64
25 #endif
26 #ifndef REG_MAXCLASS
27 #define REG_MAXCLASS 16
28 #endif
29
30 typedef struct Reclass Reclass;
31 typedef struct Renode Renode;
32 typedef struct Reinst Reinst;
33 typedef struct Rethread Rethread;
34
35 struct Reclass {
36 Rune *end;
37 Rune spans[REG_MAXSPAN];
38 };
39
40 struct Reprog {
41 Reinst *start, *end;
42 int flags;
43 int nsub;
44 Reclass cclass[REG_MAXCLASS];
45 };
46
47 struct cstate {
48 Reprog *prog;
49 Renode *pstart, *pend;
50
51 const char *source;
52 int ncclass;
53 int nsub;
54 Renode *sub[REG_MAXSUB];
55
56 int lookahead;
57 Rune yychar;
58 Reclass *yycc;
59 int yymin, yymax;
60
61 const char *error;
62 jmp_buf kaboom;
63 };
64
die(struct cstate * g,const char * message)65 static void die(struct cstate *g, const char *message)
66 {
67 g->error = message;
68 longjmp(g->kaboom, 1);
69 }
70
canon(Rune c)71 static int canon(Rune c)
72 {
73 Rune u = toupperrune(c);
74 if (c >= 128 && u < 128)
75 return c;
76 return u;
77 }
78
79 /* Scan */
80
81 enum {
82 L_CHAR = 256,
83 L_CCLASS, /* character class */
84 L_NCCLASS, /* negative character class */
85 L_NC, /* "(?:" no capture */
86 L_PLA, /* "(?=" positive lookahead */
87 L_NLA, /* "(?!" negative lookahead */
88 L_WORD, /* "\b" word boundary */
89 L_NWORD, /* "\B" non-word boundary */
90 L_REF, /* "\1" back-reference */
91 L_COUNT, /* {M,N} */
92 };
93
hex(struct cstate * g,int c)94 static int hex(struct cstate *g, int c)
95 {
96 if (c >= '0' && c <= '9') return c - '0';
97 if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
98 if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
99 die(g, "invalid escape sequence");
100 return 0;
101 }
102
dec(struct cstate * g,int c)103 static int dec(struct cstate *g, int c)
104 {
105 if (c >= '0' && c <= '9') return c - '0';
106 die(g, "invalid quantifier");
107 return 0;
108 }
109
110 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
111
isunicodeletter(int c)112 static int isunicodeletter(int c)
113 {
114 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
115 }
116
nextrune(struct cstate * g)117 static int nextrune(struct cstate *g)
118 {
119 if (!*g->source) {
120 g->yychar = EOF;
121 return 0;
122 }
123 g->source += chartorune(&g->yychar, g->source);
124 if (g->yychar == '\\') {
125 if (!*g->source)
126 die(g, "unterminated escape sequence");
127 g->source += chartorune(&g->yychar, g->source);
128 switch (g->yychar) {
129 case 'f': g->yychar = '\f'; return 0;
130 case 'n': g->yychar = '\n'; return 0;
131 case 'r': g->yychar = '\r'; return 0;
132 case 't': g->yychar = '\t'; return 0;
133 case 'v': g->yychar = '\v'; return 0;
134 case 'c':
135 if (!g->source[0])
136 die(g, "unterminated escape sequence");
137 g->yychar = (*g->source++) & 31;
138 return 0;
139 case 'x':
140 if (!g->source[0] || !g->source[1])
141 die(g, "unterminated escape sequence");
142 g->yychar = hex(g, *g->source++) << 4;
143 g->yychar += hex(g, *g->source++);
144 if (g->yychar == 0) {
145 g->yychar = '0';
146 return 1;
147 }
148 return 0;
149 case 'u':
150 if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3])
151 die(g, "unterminated escape sequence");
152 g->yychar = hex(g, *g->source++) << 12;
153 g->yychar += hex(g, *g->source++) << 8;
154 g->yychar += hex(g, *g->source++) << 4;
155 g->yychar += hex(g, *g->source++);
156 if (g->yychar == 0) {
157 g->yychar = '0';
158 return 1;
159 }
160 return 0;
161 case 0:
162 g->yychar = '0';
163 return 1;
164 }
165 if (strchr(ESCAPES, g->yychar))
166 return 1;
167 if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
168 die(g, "invalid escape character");
169 return 0;
170 }
171 return 0;
172 }
173
lexcount(struct cstate * g)174 static int lexcount(struct cstate *g)
175 {
176 g->yychar = *g->source++;
177
178 g->yymin = dec(g, g->yychar);
179 g->yychar = *g->source++;
180 while (g->yychar != ',' && g->yychar != '}') {
181 g->yymin = g->yymin * 10 + dec(g, g->yychar);
182 g->yychar = *g->source++;
183 if (g->yymin >= REPINF)
184 die(g, "numeric overflow");
185 }
186
187 if (g->yychar == ',') {
188 g->yychar = *g->source++;
189 if (g->yychar == '}') {
190 g->yymax = REPINF;
191 } else {
192 g->yymax = dec(g, g->yychar);
193 g->yychar = *g->source++;
194 while (g->yychar != '}') {
195 g->yymax = g->yymax * 10 + dec(g, g->yychar);
196 g->yychar = *g->source++;
197 if (g->yymax >= REPINF)
198 die(g, "numeric overflow");
199 }
200 }
201 } else {
202 g->yymax = g->yymin;
203 }
204
205 return L_COUNT;
206 }
207
newcclass(struct cstate * g)208 static void newcclass(struct cstate *g)
209 {
210 if (g->ncclass >= nelem(g->prog->cclass))
211 die(g, "too many character classes");
212 g->yycc = g->prog->cclass + g->ncclass++;
213 g->yycc->end = g->yycc->spans;
214 }
215
addrange(struct cstate * g,Rune a,Rune b)216 static void addrange(struct cstate *g, Rune a, Rune b)
217 {
218 if (a > b)
219 die(g, "invalid character class range");
220 if (g->yycc->end + 2 >= g->yycc->spans + nelem(g->yycc->spans))
221 die(g, "too many character class ranges");
222 *g->yycc->end++ = a;
223 *g->yycc->end++ = b;
224 }
225
addranges_d(struct cstate * g)226 static void addranges_d(struct cstate *g)
227 {
228 addrange(g, '0', '9');
229 }
230
addranges_D(struct cstate * g)231 static void addranges_D(struct cstate *g)
232 {
233 addrange(g, 0, '0'-1);
234 addrange(g, '9'+1, 0xFFFF);
235 }
236
addranges_s(struct cstate * g)237 static void addranges_s(struct cstate *g)
238 {
239 addrange(g, 0x9, 0xD);
240 addrange(g, 0x20, 0x20);
241 addrange(g, 0xA0, 0xA0);
242 addrange(g, 0x2028, 0x2029);
243 addrange(g, 0xFEFF, 0xFEFF);
244 }
245
addranges_S(struct cstate * g)246 static void addranges_S(struct cstate *g)
247 {
248 addrange(g, 0, 0x9-1);
249 addrange(g, 0xD+1, 0x20-1);
250 addrange(g, 0x20+1, 0xA0-1);
251 addrange(g, 0xA0+1, 0x2028-1);
252 addrange(g, 0x2029+1, 0xFEFF-1);
253 addrange(g, 0xFEFF+1, 0xFFFF);
254 }
255
addranges_w(struct cstate * g)256 static void addranges_w(struct cstate *g)
257 {
258 addrange(g, '0', '9');
259 addrange(g, 'A', 'Z');
260 addrange(g, '_', '_');
261 addrange(g, 'a', 'z');
262 }
263
addranges_W(struct cstate * g)264 static void addranges_W(struct cstate *g)
265 {
266 addrange(g, 0, '0'-1);
267 addrange(g, '9'+1, 'A'-1);
268 addrange(g, 'Z'+1, '_'-1);
269 addrange(g, '_'+1, 'a'-1);
270 addrange(g, 'z'+1, 0xFFFF);
271 }
272
lexclass(struct cstate * g)273 static int lexclass(struct cstate *g)
274 {
275 int type = L_CCLASS;
276 int quoted, havesave, havedash;
277 Rune save = 0;
278
279 newcclass(g);
280
281 quoted = nextrune(g);
282 if (!quoted && g->yychar == '^') {
283 type = L_NCCLASS;
284 quoted = nextrune(g);
285 }
286
287 havesave = havedash = 0;
288 for (;;) {
289 if (g->yychar == EOF)
290 die(g, "unterminated character class");
291 if (!quoted && g->yychar == ']')
292 break;
293
294 if (!quoted && g->yychar == '-') {
295 if (havesave) {
296 if (havedash) {
297 addrange(g, save, '-');
298 havesave = havedash = 0;
299 } else {
300 havedash = 1;
301 }
302 } else {
303 save = '-';
304 havesave = 1;
305 }
306 } else if (quoted && strchr("DSWdsw", g->yychar)) {
307 if (havesave) {
308 addrange(g, save, save);
309 if (havedash)
310 addrange(g, '-', '-');
311 }
312 switch (g->yychar) {
313 case 'd': addranges_d(g); break;
314 case 's': addranges_s(g); break;
315 case 'w': addranges_w(g); break;
316 case 'D': addranges_D(g); break;
317 case 'S': addranges_S(g); break;
318 case 'W': addranges_W(g); break;
319 }
320 havesave = havedash = 0;
321 } else {
322 if (quoted) {
323 if (g->yychar == 'b')
324 g->yychar = '\b';
325 else if (g->yychar == '0')
326 g->yychar = 0;
327 /* else identity escape */
328 }
329 if (havesave) {
330 if (havedash) {
331 addrange(g, save, g->yychar);
332 havesave = havedash = 0;
333 } else {
334 addrange(g, save, save);
335 save = g->yychar;
336 }
337 } else {
338 save = g->yychar;
339 havesave = 1;
340 }
341 }
342
343 quoted = nextrune(g);
344 }
345
346 if (havesave) {
347 addrange(g, save, save);
348 if (havedash)
349 addrange(g, '-', '-');
350 }
351
352 return type;
353 }
354
lex(struct cstate * g)355 static int lex(struct cstate *g)
356 {
357 int quoted = nextrune(g);
358 if (quoted) {
359 switch (g->yychar) {
360 case 'b': return L_WORD;
361 case 'B': return L_NWORD;
362 case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
363 case 's': newcclass(g); addranges_s(g); return L_CCLASS;
364 case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
365 case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
366 case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
367 case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
368 case '0': g->yychar = 0; return L_CHAR;
369 }
370 if (g->yychar >= '0' && g->yychar <= '9') {
371 g->yychar -= '0';
372 if (*g->source >= '0' && *g->source <= '9')
373 g->yychar = g->yychar * 10 + *g->source++ - '0';
374 return L_REF;
375 }
376 return L_CHAR;
377 }
378
379 switch (g->yychar) {
380 case EOF:
381 case '$': case ')': case '*': case '+':
382 case '.': case '?': case '^': case '|':
383 return g->yychar;
384 }
385
386 if (g->yychar == '{')
387 return lexcount(g);
388 if (g->yychar == '[')
389 return lexclass(g);
390 if (g->yychar == '(') {
391 if (g->source[0] == '?') {
392 if (g->source[1] == ':') {
393 g->source += 2;
394 return L_NC;
395 }
396 if (g->source[1] == '=') {
397 g->source += 2;
398 return L_PLA;
399 }
400 if (g->source[1] == '!') {
401 g->source += 2;
402 return L_NLA;
403 }
404 }
405 return '(';
406 }
407
408 return L_CHAR;
409 }
410
411 /* Parse */
412
413 enum {
414 P_CAT, P_ALT, P_REP,
415 P_BOL, P_EOL, P_WORD, P_NWORD,
416 P_PAR, P_PLA, P_NLA,
417 P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
418 P_REF,
419 };
420
421 struct Renode {
422 unsigned char type;
423 unsigned char ng, m, n;
424 Rune c;
425 Reclass *cc;
426 Renode *x;
427 Renode *y;
428 };
429
newnode(struct cstate * g,int type)430 static Renode *newnode(struct cstate *g, int type)
431 {
432 Renode *node = g->pend++;
433 node->type = type;
434 node->cc = NULL;
435 node->c = 0;
436 node->ng = 0;
437 node->m = 0;
438 node->n = 0;
439 node->x = node->y = NULL;
440 return node;
441 }
442
empty(Renode * node)443 static int empty(Renode *node)
444 {
445 if (!node) return 1;
446 switch (node->type) {
447 default: return 1;
448 case P_CAT: return empty(node->x) && empty(node->y);
449 case P_ALT: return empty(node->x) || empty(node->y);
450 case P_REP: return empty(node->x) || node->m == 0;
451 case P_PAR: return empty(node->x);
452 case P_REF: return empty(node->x);
453 case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
454 }
455 }
456
newrep(struct cstate * g,Renode * atom,int ng,int min,int max)457 static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
458 {
459 Renode *rep = newnode(g, P_REP);
460 if (max == REPINF && empty(atom))
461 die(g, "infinite loop matching the empty string");
462 rep->ng = ng;
463 rep->m = min;
464 rep->n = max;
465 rep->x = atom;
466 return rep;
467 }
468
next(struct cstate * g)469 static void next(struct cstate *g)
470 {
471 g->lookahead = lex(g);
472 }
473
accept(struct cstate * g,int t)474 static int accept(struct cstate *g, int t)
475 {
476 if (g->lookahead == t) {
477 next(g);
478 return 1;
479 }
480 return 0;
481 }
482
483 static Renode *parsealt(struct cstate *g);
484
parseatom(struct cstate * g)485 static Renode *parseatom(struct cstate *g)
486 {
487 Renode *atom;
488 if (g->lookahead == L_CHAR) {
489 atom = newnode(g, P_CHAR);
490 atom->c = g->yychar;
491 next(g);
492 return atom;
493 }
494 if (g->lookahead == L_CCLASS) {
495 atom = newnode(g, P_CCLASS);
496 atom->cc = g->yycc;
497 next(g);
498 return atom;
499 }
500 if (g->lookahead == L_NCCLASS) {
501 atom = newnode(g, P_NCCLASS);
502 atom->cc = g->yycc;
503 next(g);
504 return atom;
505 }
506 if (g->lookahead == L_REF) {
507 atom = newnode(g, P_REF);
508 if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
509 die(g, "invalid back-reference");
510 atom->n = g->yychar;
511 atom->x = g->sub[g->yychar];
512 next(g);
513 return atom;
514 }
515 if (accept(g, '.'))
516 return newnode(g, P_ANY);
517 if (accept(g, '(')) {
518 atom = newnode(g, P_PAR);
519 if (g->nsub == REG_MAXSUB)
520 die(g, "too many captures");
521 atom->n = g->nsub++;
522 atom->x = parsealt(g);
523 g->sub[atom->n] = atom;
524 if (!accept(g, ')'))
525 die(g, "unmatched '('");
526 return atom;
527 }
528 if (accept(g, L_NC)) {
529 atom = parsealt(g);
530 if (!accept(g, ')'))
531 die(g, "unmatched '('");
532 return atom;
533 }
534 if (accept(g, L_PLA)) {
535 atom = newnode(g, P_PLA);
536 atom->x = parsealt(g);
537 if (!accept(g, ')'))
538 die(g, "unmatched '('");
539 return atom;
540 }
541 if (accept(g, L_NLA)) {
542 atom = newnode(g, P_NLA);
543 atom->x = parsealt(g);
544 if (!accept(g, ')'))
545 die(g, "unmatched '('");
546 return atom;
547 }
548 die(g, "syntax error");
549 return NULL;
550 }
551
parserep(struct cstate * g)552 static Renode *parserep(struct cstate *g)
553 {
554 Renode *atom;
555
556 if (accept(g, '^')) return newnode(g, P_BOL);
557 if (accept(g, '$')) return newnode(g, P_EOL);
558 if (accept(g, L_WORD)) return newnode(g, P_WORD);
559 if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
560
561 atom = parseatom(g);
562 if (g->lookahead == L_COUNT) {
563 int min = g->yymin, max = g->yymax;
564 next(g);
565 if (max < min)
566 die(g, "invalid quantifier");
567 return newrep(g, atom, accept(g, '?'), min, max);
568 }
569 if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
570 if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
571 if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
572 return atom;
573 }
574
parsecat(struct cstate * g)575 static Renode *parsecat(struct cstate *g)
576 {
577 Renode *cat, *head, **tail;
578 if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
579 /* Build a right-leaning tree by splicing in new 'cat' at the tail. */
580 head = parserep(g);
581 tail = &head;
582 while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
583 cat = newnode(g, P_CAT);
584 cat->x = *tail;
585 cat->y = parserep(g);
586 *tail = cat;
587 tail = &cat->y;
588 }
589 return head;
590 }
591 return NULL;
592 }
593
parsealt(struct cstate * g)594 static Renode *parsealt(struct cstate *g)
595 {
596 Renode *alt, *x;
597 alt = parsecat(g);
598 while (accept(g, '|')) {
599 x = alt;
600 alt = newnode(g, P_ALT);
601 alt->x = x;
602 alt->y = parsecat(g);
603 }
604 return alt;
605 }
606
607 /* Compile */
608
609 enum {
610 I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
611 I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
612 I_BOL, I_EOL, I_WORD, I_NWORD,
613 I_LPAR, I_RPAR
614 };
615
616 struct Reinst {
617 unsigned char opcode;
618 unsigned char n;
619 Rune c;
620 Reclass *cc;
621 Reinst *x;
622 Reinst *y;
623 };
624
count(struct cstate * g,Renode * node)625 static int count(struct cstate *g, Renode *node)
626 {
627 int min, max, n;
628 if (!node) return 0;
629 switch (node->type) {
630 default: return 1;
631 case P_CAT: return count(g, node->x) + count(g, node->y);
632 case P_ALT: return count(g, node->x) + count(g, node->y) + 2;
633 case P_REP:
634 min = node->m;
635 max = node->n;
636 if (min == max) n = count(g, node->x) * min;
637 else if (max < REPINF) n = count(g, node->x) * max + (max - min);
638 else n = count(g, node->x) * (min + 1) + 2;
639 if (n < 0 || n > REG_MAXPROG) die(g, "program too large");
640 return n;
641 case P_PAR: return count(g, node->x) + 2;
642 case P_PLA: return count(g, node->x) + 2;
643 case P_NLA: return count(g, node->x) + 2;
644 }
645 }
646
emit(Reprog * prog,int opcode)647 static Reinst *emit(Reprog *prog, int opcode)
648 {
649 Reinst *inst = prog->end++;
650 inst->opcode = opcode;
651 inst->n = 0;
652 inst->c = 0;
653 inst->cc = NULL;
654 inst->x = inst->y = NULL;
655 return inst;
656 }
657
compile(Reprog * prog,Renode * node)658 static void compile(Reprog *prog, Renode *node)
659 {
660 Reinst *inst, *split, *jump;
661 int i;
662
663 loop:
664 if (!node)
665 return;
666
667 switch (node->type) {
668 case P_CAT:
669 compile(prog, node->x);
670 node = node->y;
671 goto loop;
672
673 case P_ALT:
674 split = emit(prog, I_SPLIT);
675 compile(prog, node->x);
676 jump = emit(prog, I_JUMP);
677 compile(prog, node->y);
678 split->x = split + 1;
679 split->y = jump + 1;
680 jump->x = prog->end;
681 break;
682
683 case P_REP:
684 inst = NULL; /* silence compiler warning. assert(node->m > 0). */
685 for (i = 0; i < node->m; ++i) {
686 inst = prog->end;
687 compile(prog, node->x);
688 }
689 if (node->m == node->n)
690 break;
691 if (node->n < REPINF) {
692 for (i = node->m; i < node->n; ++i) {
693 split = emit(prog, I_SPLIT);
694 compile(prog, node->x);
695 if (node->ng) {
696 split->y = split + 1;
697 split->x = prog->end;
698 } else {
699 split->x = split + 1;
700 split->y = prog->end;
701 }
702 }
703 } else if (node->m == 0) {
704 split = emit(prog, I_SPLIT);
705 compile(prog, node->x);
706 jump = emit(prog, I_JUMP);
707 if (node->ng) {
708 split->y = split + 1;
709 split->x = prog->end;
710 } else {
711 split->x = split + 1;
712 split->y = prog->end;
713 }
714 jump->x = split;
715 } else {
716 split = emit(prog, I_SPLIT);
717 if (node->ng) {
718 split->y = inst;
719 split->x = prog->end;
720 } else {
721 split->x = inst;
722 split->y = prog->end;
723 }
724 }
725 break;
726
727 case P_BOL: emit(prog, I_BOL); break;
728 case P_EOL: emit(prog, I_EOL); break;
729 case P_WORD: emit(prog, I_WORD); break;
730 case P_NWORD: emit(prog, I_NWORD); break;
731
732 case P_PAR:
733 inst = emit(prog, I_LPAR);
734 inst->n = node->n;
735 compile(prog, node->x);
736 inst = emit(prog, I_RPAR);
737 inst->n = node->n;
738 break;
739 case P_PLA:
740 split = emit(prog, I_PLA);
741 compile(prog, node->x);
742 emit(prog, I_END);
743 split->x = split + 1;
744 split->y = prog->end;
745 break;
746 case P_NLA:
747 split = emit(prog, I_NLA);
748 compile(prog, node->x);
749 emit(prog, I_END);
750 split->x = split + 1;
751 split->y = prog->end;
752 break;
753
754 case P_ANY:
755 emit(prog, I_ANY);
756 break;
757 case P_CHAR:
758 inst = emit(prog, I_CHAR);
759 inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
760 break;
761 case P_CCLASS:
762 inst = emit(prog, I_CCLASS);
763 inst->cc = node->cc;
764 break;
765 case P_NCCLASS:
766 inst = emit(prog, I_NCCLASS);
767 inst->cc = node->cc;
768 break;
769 case P_REF:
770 inst = emit(prog, I_REF);
771 inst->n = node->n;
772 break;
773 }
774 }
775
776 #ifdef TEST
dumpnode(Renode * node)777 static void dumpnode(Renode *node)
778 {
779 Rune *p;
780 if (!node) { printf("Empty"); return; }
781 switch (node->type) {
782 case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
783 case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
784 case P_REP:
785 printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
786 dumpnode(node->x);
787 printf(")");
788 break;
789 case P_BOL: printf("Bol"); break;
790 case P_EOL: printf("Eol"); break;
791 case P_WORD: printf("Word"); break;
792 case P_NWORD: printf("NotWord"); break;
793 case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
794 case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
795 case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
796 case P_ANY: printf("Any"); break;
797 case P_CHAR: printf("Char(%c)", node->c); break;
798 case P_CCLASS:
799 printf("Class(");
800 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
801 printf(")");
802 break;
803 case P_NCCLASS:
804 printf("NotClass(");
805 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
806 printf(")");
807 break;
808 case P_REF: printf("Ref(%d)", node->n); break;
809 }
810 }
811
dumpprog(Reprog * prog)812 static void dumpprog(Reprog *prog)
813 {
814 Reinst *inst;
815 int i;
816 for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
817 printf("% 5d: ", i);
818 switch (inst->opcode) {
819 case I_END: puts("end"); break;
820 case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
821 case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
822 case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
823 case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
824 case I_ANY: puts("any"); break;
825 case I_ANYNL: puts("anynl"); break;
826 case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
827 case I_CCLASS: puts("cclass"); break;
828 case I_NCCLASS: puts("ncclass"); break;
829 case I_REF: printf("ref %d\n", inst->n); break;
830 case I_BOL: puts("bol"); break;
831 case I_EOL: puts("eol"); break;
832 case I_WORD: puts("word"); break;
833 case I_NWORD: puts("nword"); break;
834 case I_LPAR: printf("lpar %d\n", inst->n); break;
835 case I_RPAR: printf("rpar %d\n", inst->n); break;
836 }
837 }
838 }
839 #endif
840
regcompx(void * (* alloc)(void * ctx,void * p,int n),void * ctx,const char * pattern,int cflags,const char ** errorp)841 Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
842 const char *pattern, int cflags, const char **errorp)
843 {
844 struct cstate g;
845 Renode *node;
846 Reinst *split, *jump;
847 int i, n;
848
849 g.pstart = NULL;
850 g.prog = NULL;
851
852 if (setjmp(g.kaboom)) {
853 if (errorp) *errorp = g.error;
854 alloc(ctx, g.pstart, 0);
855 alloc(ctx, g.prog, 0);
856 return NULL;
857 }
858
859 g.prog = alloc(ctx, NULL, sizeof (Reprog));
860 if (!g.prog)
861 die(&g, "cannot allocate regular expression");
862 n = strlen(pattern) * 2;
863 if (n > REG_MAXPROG)
864 die(&g, "program too large");
865 if (n > 0) {
866 g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
867 if (!g.pstart)
868 die(&g, "cannot allocate regular expression parse list");
869 }
870
871 g.source = pattern;
872 g.ncclass = 0;
873 g.nsub = 1;
874 for (i = 0; i < REG_MAXSUB; ++i)
875 g.sub[i] = 0;
876
877 g.prog->flags = cflags;
878
879 next(&g);
880 node = parsealt(&g);
881 if (g.lookahead == ')')
882 die(&g, "unmatched ')'");
883 if (g.lookahead != EOF)
884 die(&g, "syntax error");
885
886 #ifdef TEST
887 dumpnode(node);
888 putchar('\n');
889 #endif
890
891 n = 6 + count(&g, node);
892 if (n < 0 || n > REG_MAXPROG)
893 die(&g, "program too large");
894
895 g.prog->nsub = g.nsub;
896 g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
897 if (!g.prog->start)
898 die(&g, "cannot allocate regular expression instruction list");
899
900 split = emit(g.prog, I_SPLIT);
901 split->x = split + 3;
902 split->y = split + 1;
903 emit(g.prog, I_ANYNL);
904 jump = emit(g.prog, I_JUMP);
905 jump->x = split;
906 emit(g.prog, I_LPAR);
907 compile(g.prog, node);
908 emit(g.prog, I_RPAR);
909 emit(g.prog, I_END);
910
911 #ifdef TEST
912 dumpprog(g.prog);
913 #endif
914
915 alloc(ctx, g.pstart, 0);
916
917 if (errorp) *errorp = NULL;
918 return g.prog;
919 }
920
regfreex(void * (* alloc)(void * ctx,void * p,int n),void * ctx,Reprog * prog)921 void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
922 {
923 if (prog) {
924 alloc(ctx, prog->start, 0);
925 alloc(ctx, prog, 0);
926 }
927 }
928
default_alloc(void * ctx,void * p,int n)929 static void *default_alloc(void *ctx, void *p, int n)
930 {
931 return realloc(p, (size_t)n);
932 }
933
regcomp(const char * pattern,int cflags,const char ** errorp)934 Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
935 {
936 return regcompx(default_alloc, NULL, pattern, cflags, errorp);
937 }
938
regfree(Reprog * prog)939 void regfree(Reprog *prog)
940 {
941 regfreex(default_alloc, NULL, prog);
942 }
943
944 /* Match */
945
isnewline(int c)946 static int isnewline(int c)
947 {
948 return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
949 }
950
iswordchar(int c)951 static int iswordchar(int c)
952 {
953 return c == '_' ||
954 (c >= 'a' && c <= 'z') ||
955 (c >= 'A' && c <= 'Z') ||
956 (c >= '0' && c <= '9');
957 }
958
incclass(Reclass * cc,Rune c)959 static int incclass(Reclass *cc, Rune c)
960 {
961 Rune *p;
962 for (p = cc->spans; p < cc->end; p += 2)
963 if (p[0] <= c && c <= p[1])
964 return 1;
965 return 0;
966 }
967
incclasscanon(Reclass * cc,Rune c)968 static int incclasscanon(Reclass *cc, Rune c)
969 {
970 Rune *p, r;
971 for (p = cc->spans; p < cc->end; p += 2)
972 for (r = p[0]; r <= p[1]; ++r)
973 if (c == canon(r))
974 return 1;
975 return 0;
976 }
977
strncmpcanon(const char * a,const char * b,int n)978 static int strncmpcanon(const char *a, const char *b, int n)
979 {
980 Rune ra, rb;
981 int c;
982 while (n--) {
983 if (!*a) return -1;
984 if (!*b) return 1;
985 a += chartorune(&ra, a);
986 b += chartorune(&rb, b);
987 c = canon(ra) - canon(rb);
988 if (c)
989 return c;
990 }
991 return 0;
992 }
993
match(Reinst * pc,const char * sp,const char * bol,int flags,Resub * out,int depth)994 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth)
995 {
996 Resub scratch;
997 int result;
998 int i;
999 Rune c;
1000
1001 /* stack overflow */
1002 if (depth > REG_MAXREC)
1003 return -1;
1004
1005 for (;;) {
1006 switch (pc->opcode) {
1007 case I_END:
1008 return 0;
1009 case I_JUMP:
1010 pc = pc->x;
1011 break;
1012 case I_SPLIT:
1013 scratch = *out;
1014 result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1015 if (result == -1)
1016 return -1;
1017 if (result == 0) {
1018 *out = scratch;
1019 return 0;
1020 }
1021 pc = pc->y;
1022 break;
1023
1024 case I_PLA:
1025 result = match(pc->x, sp, bol, flags, out, depth+1);
1026 if (result == -1)
1027 return -1;
1028 if (result == 1)
1029 return 1;
1030 pc = pc->y;
1031 break;
1032 case I_NLA:
1033 scratch = *out;
1034 result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1035 if (result == -1)
1036 return -1;
1037 if (result == 0)
1038 return 1;
1039 pc = pc->y;
1040 break;
1041
1042 case I_ANYNL:
1043 if (!*sp) return 1;
1044 sp += chartorune(&c, sp);
1045 pc = pc + 1;
1046 break;
1047 case I_ANY:
1048 if (!*sp) return 1;
1049 sp += chartorune(&c, sp);
1050 if (isnewline(c))
1051 return 1;
1052 pc = pc + 1;
1053 break;
1054 case I_CHAR:
1055 if (!*sp) return 1;
1056 sp += chartorune(&c, sp);
1057 if (flags & REG_ICASE)
1058 c = canon(c);
1059 if (c != pc->c)
1060 return 1;
1061 pc = pc + 1;
1062 break;
1063 case I_CCLASS:
1064 if (!*sp) return 1;
1065 sp += chartorune(&c, sp);
1066 if (flags & REG_ICASE) {
1067 if (!incclasscanon(pc->cc, canon(c)))
1068 return 1;
1069 } else {
1070 if (!incclass(pc->cc, c))
1071 return 1;
1072 }
1073 pc = pc + 1;
1074 break;
1075 case I_NCCLASS:
1076 if (!*sp) return 1;
1077 sp += chartorune(&c, sp);
1078 if (flags & REG_ICASE) {
1079 if (incclasscanon(pc->cc, canon(c)))
1080 return 1;
1081 } else {
1082 if (incclass(pc->cc, c))
1083 return 1;
1084 }
1085 pc = pc + 1;
1086 break;
1087 case I_REF:
1088 i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1089 if (flags & REG_ICASE) {
1090 if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1091 return 1;
1092 } else {
1093 if (strncmp(sp, out->sub[pc->n].sp, i))
1094 return 1;
1095 }
1096 if (i > 0)
1097 sp += i;
1098 pc = pc + 1;
1099 break;
1100
1101 case I_BOL:
1102 if (sp == bol && !(flags & REG_NOTBOL)) {
1103 pc = pc + 1;
1104 break;
1105 }
1106 if (flags & REG_NEWLINE) {
1107 if (sp > bol && isnewline(sp[-1])) {
1108 pc = pc + 1;
1109 break;
1110 }
1111 }
1112 return 1;
1113 case I_EOL:
1114 if (*sp == 0) {
1115 pc = pc + 1;
1116 break;
1117 }
1118 if (flags & REG_NEWLINE) {
1119 if (isnewline(*sp)) {
1120 pc = pc + 1;
1121 break;
1122 }
1123 }
1124 return 1;
1125 case I_WORD:
1126 i = sp > bol && iswordchar(sp[-1]);
1127 i ^= iswordchar(sp[0]);
1128 if (!i)
1129 return 1;
1130 pc = pc + 1;
1131 break;
1132 case I_NWORD:
1133 i = sp > bol && iswordchar(sp[-1]);
1134 i ^= iswordchar(sp[0]);
1135 if (i)
1136 return 1;
1137 pc = pc + 1;
1138 break;
1139
1140 case I_LPAR:
1141 out->sub[pc->n].sp = sp;
1142 pc = pc + 1;
1143 break;
1144 case I_RPAR:
1145 out->sub[pc->n].ep = sp;
1146 pc = pc + 1;
1147 break;
1148 default:
1149 return 1;
1150 }
1151 }
1152 }
1153
regexec(Reprog * prog,const char * sp,Resub * sub,int eflags)1154 int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1155 {
1156 Resub scratch;
1157 int i;
1158
1159 if (!sub)
1160 sub = &scratch;
1161
1162 sub->nsub = prog->nsub;
1163 for (i = 0; i < REG_MAXSUB; ++i)
1164 sub->sub[i].sp = sub->sub[i].ep = NULL;
1165
1166 return match(prog->start, sp, sp, prog->flags | eflags, sub, 0);
1167 }
1168
1169 #ifdef TEST
main(int argc,char ** argv)1170 int main(int argc, char **argv)
1171 {
1172 const char *error;
1173 const char *s;
1174 Reprog *p;
1175 Resub m;
1176 int i;
1177
1178 if (argc > 1) {
1179 p = regcomp(argv[1], 0, &error);
1180 if (!p) {
1181 fprintf(stderr, "regcomp: %s\n", error);
1182 return 1;
1183 }
1184
1185 if (argc > 2) {
1186 s = argv[2];
1187 printf("nsub = %d\n", p->nsub);
1188 if (!regexec(p, s, &m, 0)) {
1189 for (i = 0; i < m.nsub; ++i) {
1190 int n = m.sub[i].ep - m.sub[i].sp;
1191 if (n > 0)
1192 printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1193 else
1194 printf("match %d: n=0 ''\n", i);
1195 }
1196 } else {
1197 printf("no match\n");
1198 }
1199 }
1200 }
1201
1202 return 0;
1203 }
1204 #endif
1205