1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <setjmp.h>
5 #include <limits.h>
6
7 #include "regexp.h"
8 #include "utf.h"
9
10 #define emit regemit
11 #define next regnext
12 #define accept regaccept
13
14 #define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15
16 #define REPINF 255
17 #define MAXSUB REG_MAXSUB
18 #define MAXPROG (32 << 10)
19
20 typedef struct Reclass Reclass;
21 typedef struct Renode Renode;
22 typedef struct Reinst Reinst;
23 typedef struct Rethread Rethread;
24
25 struct Reclass {
26 Rune *end;
27 Rune spans[64];
28 };
29
30 struct Reprog {
31 Reinst *start, *end;
32 int flags;
33 int nsub;
34 Reclass cclass[16];
35 };
36
37 struct cstate {
38 Reprog *prog;
39 Renode *pstart, *pend;
40
41 const char *source;
42 int ncclass;
43 int nsub;
44 Renode *sub[MAXSUB];
45
46 int lookahead;
47 Rune yychar;
48 Reclass *yycc;
49 int yymin, yymax;
50
51 const char *error;
52 jmp_buf kaboom;
53 };
54
die(struct cstate * g,const char * message)55 static void die(struct cstate *g, const char *message)
56 {
57 g->error = message;
58 longjmp(g->kaboom, 1);
59 }
60
canon(Rune c)61 static int canon(Rune c)
62 {
63 Rune u = toupperrune(c);
64 if (c >= 128 && u < 128)
65 return c;
66 return u;
67 }
68
69 /* Scan */
70
71 enum {
72 L_CHAR = 256,
73 L_CCLASS, /* character class */
74 L_NCCLASS, /* negative character class */
75 L_NC, /* "(?:" no capture */
76 L_PLA, /* "(?=" positive lookahead */
77 L_NLA, /* "(?!" negative lookahead */
78 L_WORD, /* "\b" word boundary */
79 L_NWORD, /* "\B" non-word boundary */
80 L_REF, /* "\1" back-reference */
81 L_COUNT, /* {M,N} */
82 };
83
hex(struct cstate * g,int c)84 static int hex(struct cstate *g, int c)
85 {
86 if (c >= '0' && c <= '9') return c - '0';
87 if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
88 if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
89 die(g, "invalid escape sequence");
90 return 0;
91 }
92
dec(struct cstate * g,int c)93 static int dec(struct cstate *g, int c)
94 {
95 if (c >= '0' && c <= '9') return c - '0';
96 die(g, "invalid quantifier");
97 return 0;
98 }
99
100 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
101
isunicodeletter(int c)102 static int isunicodeletter(int c)
103 {
104 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
105 }
106
nextrune(struct cstate * g)107 static int nextrune(struct cstate *g)
108 {
109 g->source += chartorune(&g->yychar, g->source);
110 if (g->yychar == '\\') {
111 g->source += chartorune(&g->yychar, g->source);
112 switch (g->yychar) {
113 case 0: die(g, "unterminated escape sequence");
114 case 'f': g->yychar = '\f'; return 0;
115 case 'n': g->yychar = '\n'; return 0;
116 case 'r': g->yychar = '\r'; return 0;
117 case 't': g->yychar = '\t'; return 0;
118 case 'v': g->yychar = '\v'; return 0;
119 case 'c':
120 g->yychar = (*g->source++) & 31;
121 return 0;
122 case 'x':
123 g->yychar = hex(g, *g->source++) << 4;
124 g->yychar += hex(g, *g->source++);
125 if (g->yychar == 0) {
126 g->yychar = '0';
127 return 1;
128 }
129 return 0;
130 case 'u':
131 g->yychar = hex(g, *g->source++) << 12;
132 g->yychar += hex(g, *g->source++) << 8;
133 g->yychar += hex(g, *g->source++) << 4;
134 g->yychar += hex(g, *g->source++);
135 if (g->yychar == 0) {
136 g->yychar = '0';
137 return 1;
138 }
139 return 0;
140 }
141 if (strchr(ESCAPES, g->yychar))
142 return 1;
143 if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
144 die(g, "invalid escape character");
145 return 0;
146 }
147 return 0;
148 }
149
lexcount(struct cstate * g)150 static int lexcount(struct cstate *g)
151 {
152 g->yychar = *g->source++;
153
154 g->yymin = dec(g, g->yychar);
155 g->yychar = *g->source++;
156 while (g->yychar != ',' && g->yychar != '}') {
157 g->yymin = g->yymin * 10 + dec(g, g->yychar);
158 g->yychar = *g->source++;
159 }
160 if (g->yymin < 0 || g->yymin >= REPINF)
161 die(g, "numeric overflow");
162
163 if (g->yychar == ',') {
164 g->yychar = *g->source++;
165 if (g->yychar == '}') {
166 g->yymax = REPINF;
167 } else {
168 g->yymax = dec(g, g->yychar);
169 g->yychar = *g->source++;
170 while (g->yychar != '}') {
171 g->yymax = g->yymax * 10 + dec(g, g->yychar);
172 g->yychar = *g->source++;
173 }
174 if (g->yymax < 0 || g->yymax >= REPINF)
175 die(g, "numeric overflow");
176 }
177 } else {
178 g->yymax = g->yymin;
179 }
180
181 return L_COUNT;
182 }
183
newcclass(struct cstate * g)184 static void newcclass(struct cstate *g)
185 {
186 if (g->ncclass >= nelem(g->prog->cclass))
187 die(g, "too many character classes");
188 g->yycc = g->prog->cclass + g->ncclass++;
189 g->yycc->end = g->yycc->spans;
190 }
191
addrange(struct cstate * g,Rune a,Rune b)192 static void addrange(struct cstate *g, Rune a, Rune b)
193 {
194 if (a > b)
195 die(g, "invalid character class range");
196 if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
197 die(g, "too many character class ranges");
198 *g->yycc->end++ = a;
199 *g->yycc->end++ = b;
200 }
201
addranges_d(struct cstate * g)202 static void addranges_d(struct cstate *g)
203 {
204 addrange(g, '0', '9');
205 }
206
addranges_D(struct cstate * g)207 static void addranges_D(struct cstate *g)
208 {
209 addrange(g, 0, '0'-1);
210 addrange(g, '9'+1, 0xFFFF);
211 }
212
addranges_s(struct cstate * g)213 static void addranges_s(struct cstate *g)
214 {
215 addrange(g, 0x9, 0xD);
216 addrange(g, 0x20, 0x20);
217 addrange(g, 0xA0, 0xA0);
218 addrange(g, 0x2028, 0x2029);
219 addrange(g, 0xFEFF, 0xFEFF);
220 }
221
addranges_S(struct cstate * g)222 static void addranges_S(struct cstate *g)
223 {
224 addrange(g, 0, 0x9-1);
225 addrange(g, 0xD+1, 0x20-1);
226 addrange(g, 0x20+1, 0xA0-1);
227 addrange(g, 0xA0+1, 0x2028-1);
228 addrange(g, 0x2029+1, 0xFEFF-1);
229 addrange(g, 0xFEFF+1, 0xFFFF);
230 }
231
addranges_w(struct cstate * g)232 static void addranges_w(struct cstate *g)
233 {
234 addrange(g, '0', '9');
235 addrange(g, 'A', 'Z');
236 addrange(g, '_', '_');
237 addrange(g, 'a', 'z');
238 }
239
addranges_W(struct cstate * g)240 static void addranges_W(struct cstate *g)
241 {
242 addrange(g, 0, '0'-1);
243 addrange(g, '9'+1, 'A'-1);
244 addrange(g, 'Z'+1, '_'-1);
245 addrange(g, '_'+1, 'a'-1);
246 addrange(g, 'z'+1, 0xFFFF);
247 }
248
lexclass(struct cstate * g)249 static int lexclass(struct cstate *g)
250 {
251 int type = L_CCLASS;
252 int quoted, havesave, havedash;
253 Rune save = 0;
254
255 newcclass(g);
256
257 quoted = nextrune(g);
258 if (!quoted && g->yychar == '^') {
259 type = L_NCCLASS;
260 quoted = nextrune(g);
261 }
262
263 havesave = havedash = 0;
264 for (;;) {
265 if (g->yychar == 0)
266 die(g, "unterminated character class");
267 if (!quoted && g->yychar == ']')
268 break;
269
270 if (!quoted && g->yychar == '-') {
271 if (havesave) {
272 if (havedash) {
273 addrange(g, save, '-');
274 havesave = havedash = 0;
275 } else {
276 havedash = 1;
277 }
278 } else {
279 save = '-';
280 havesave = 1;
281 }
282 } else if (quoted && strchr("DSWdsw", g->yychar)) {
283 if (havesave) {
284 addrange(g, save, save);
285 if (havedash)
286 addrange(g, '-', '-');
287 }
288 switch (g->yychar) {
289 case 'd': addranges_d(g); break;
290 case 's': addranges_s(g); break;
291 case 'w': addranges_w(g); break;
292 case 'D': addranges_D(g); break;
293 case 'S': addranges_S(g); break;
294 case 'W': addranges_W(g); break;
295 }
296 havesave = havedash = 0;
297 } else {
298 if (quoted) {
299 if (g->yychar == 'b')
300 g->yychar = '\b';
301 else if (g->yychar == '0')
302 g->yychar = 0;
303 /* else identity escape */
304 }
305 if (havesave) {
306 if (havedash) {
307 addrange(g, save, g->yychar);
308 havesave = havedash = 0;
309 } else {
310 addrange(g, save, save);
311 save = g->yychar;
312 }
313 } else {
314 save = g->yychar;
315 havesave = 1;
316 }
317 }
318
319 quoted = nextrune(g);
320 }
321
322 if (havesave) {
323 addrange(g, save, save);
324 if (havedash)
325 addrange(g, '-', '-');
326 }
327
328 return type;
329 }
330
lex(struct cstate * g)331 static int lex(struct cstate *g)
332 {
333 int quoted = nextrune(g);
334 if (quoted) {
335 switch (g->yychar) {
336 case 'b': return L_WORD;
337 case 'B': return L_NWORD;
338 case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
339 case 's': newcclass(g); addranges_s(g); return L_CCLASS;
340 case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
341 case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
342 case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
343 case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
344 case '0': g->yychar = 0; return L_CHAR;
345 }
346 if (g->yychar >= '0' && g->yychar <= '9') {
347 g->yychar -= '0';
348 if (*g->source >= '0' && *g->source <= '9')
349 g->yychar = g->yychar * 10 + *g->source++ - '0';
350 return L_REF;
351 }
352 return L_CHAR;
353 }
354
355 switch (g->yychar) {
356 case 0:
357 case '$': case ')': case '*': case '+':
358 case '.': case '?': case '^': case '|':
359 return g->yychar;
360 }
361
362 if (g->yychar == '{')
363 return lexcount(g);
364 if (g->yychar == '[')
365 return lexclass(g);
366 if (g->yychar == '(') {
367 if (g->source[0] == '?') {
368 if (g->source[1] == ':') {
369 g->source += 2;
370 return L_NC;
371 }
372 if (g->source[1] == '=') {
373 g->source += 2;
374 return L_PLA;
375 }
376 if (g->source[1] == '!') {
377 g->source += 2;
378 return L_NLA;
379 }
380 }
381 return '(';
382 }
383
384 return L_CHAR;
385 }
386
387 /* Parse */
388
389 enum {
390 P_CAT, P_ALT, P_REP,
391 P_BOL, P_EOL, P_WORD, P_NWORD,
392 P_PAR, P_PLA, P_NLA,
393 P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
394 P_REF,
395 };
396
397 struct Renode {
398 unsigned char type;
399 unsigned char ng, m, n;
400 Rune c;
401 Reclass *cc;
402 Renode *x;
403 Renode *y;
404 };
405
newnode(struct cstate * g,int type)406 static Renode *newnode(struct cstate *g, int type)
407 {
408 Renode *node = g->pend++;
409 node->type = type;
410 node->cc = NULL;
411 node->c = 0;
412 node->ng = 0;
413 node->m = 0;
414 node->n = 0;
415 node->x = node->y = NULL;
416 return node;
417 }
418
empty(Renode * node)419 static int empty(Renode *node)
420 {
421 if (!node) return 1;
422 switch (node->type) {
423 default: return 1;
424 case P_CAT: return empty(node->x) && empty(node->y);
425 case P_ALT: return empty(node->x) || empty(node->y);
426 case P_REP: return empty(node->x) || node->m == 0;
427 case P_PAR: return empty(node->x);
428 case P_REF: return empty(node->x);
429 case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
430 }
431 }
432
newrep(struct cstate * g,Renode * atom,int ng,int min,int max)433 static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
434 {
435 Renode *rep = newnode(g, P_REP);
436 if (max == REPINF && empty(atom))
437 die(g, "infinite loop matching the empty string");
438 rep->ng = ng;
439 rep->m = min;
440 rep->n = max;
441 rep->x = atom;
442 return rep;
443 }
444
next(struct cstate * g)445 static void next(struct cstate *g)
446 {
447 g->lookahead = lex(g);
448 }
449
accept(struct cstate * g,int t)450 static int accept(struct cstate *g, int t)
451 {
452 if (g->lookahead == t) {
453 next(g);
454 return 1;
455 }
456 return 0;
457 }
458
459 static Renode *parsealt(struct cstate *g);
460
parseatom(struct cstate * g)461 static Renode *parseatom(struct cstate *g)
462 {
463 Renode *atom;
464 if (g->lookahead == L_CHAR) {
465 atom = newnode(g, P_CHAR);
466 atom->c = g->yychar;
467 next(g);
468 return atom;
469 }
470 if (g->lookahead == L_CCLASS) {
471 atom = newnode(g, P_CCLASS);
472 atom->cc = g->yycc;
473 next(g);
474 return atom;
475 }
476 if (g->lookahead == L_NCCLASS) {
477 atom = newnode(g, P_NCCLASS);
478 atom->cc = g->yycc;
479 next(g);
480 return atom;
481 }
482 if (g->lookahead == L_REF) {
483 atom = newnode(g, P_REF);
484 if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
485 die(g, "invalid back-reference");
486 atom->n = g->yychar;
487 atom->x = g->sub[g->yychar];
488 next(g);
489 return atom;
490 }
491 if (accept(g, '.'))
492 return newnode(g, P_ANY);
493 if (accept(g, '(')) {
494 atom = newnode(g, P_PAR);
495 if (g->nsub == MAXSUB)
496 die(g, "too many captures");
497 atom->n = g->nsub++;
498 atom->x = parsealt(g);
499 g->sub[atom->n] = atom;
500 if (!accept(g, ')'))
501 die(g, "unmatched '('");
502 return atom;
503 }
504 if (accept(g, L_NC)) {
505 atom = parsealt(g);
506 if (!accept(g, ')'))
507 die(g, "unmatched '('");
508 return atom;
509 }
510 if (accept(g, L_PLA)) {
511 atom = newnode(g, P_PLA);
512 atom->x = parsealt(g);
513 if (!accept(g, ')'))
514 die(g, "unmatched '('");
515 return atom;
516 }
517 if (accept(g, L_NLA)) {
518 atom = newnode(g, P_NLA);
519 atom->x = parsealt(g);
520 if (!accept(g, ')'))
521 die(g, "unmatched '('");
522 return atom;
523 }
524 die(g, "syntax error");
525 return NULL;
526 }
527
parserep(struct cstate * g)528 static Renode *parserep(struct cstate *g)
529 {
530 Renode *atom;
531
532 if (accept(g, '^')) return newnode(g, P_BOL);
533 if (accept(g, '$')) return newnode(g, P_EOL);
534 if (accept(g, L_WORD)) return newnode(g, P_WORD);
535 if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
536
537 atom = parseatom(g);
538 if (g->lookahead == L_COUNT) {
539 int min = g->yymin, max = g->yymax;
540 next(g);
541 if (max < min)
542 die(g, "invalid quantifier");
543 return newrep(g, atom, accept(g, '?'), min, max);
544 }
545 if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
546 if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
547 if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
548 return atom;
549 }
550
parsecat(struct cstate * g)551 static Renode *parsecat(struct cstate *g)
552 {
553 Renode *cat, *x;
554 if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
555 cat = parserep(g);
556 while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
557 x = cat;
558 cat = newnode(g, P_CAT);
559 cat->x = x;
560 cat->y = parserep(g);
561 }
562 return cat;
563 }
564 return NULL;
565 }
566
parsealt(struct cstate * g)567 static Renode *parsealt(struct cstate *g)
568 {
569 Renode *alt, *x;
570 alt = parsecat(g);
571 while (accept(g, '|')) {
572 x = alt;
573 alt = newnode(g, P_ALT);
574 alt->x = x;
575 alt->y = parsecat(g);
576 }
577 return alt;
578 }
579
580 /* Compile */
581
582 enum {
583 I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
584 I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
585 I_BOL, I_EOL, I_WORD, I_NWORD,
586 I_LPAR, I_RPAR
587 };
588
589 struct Reinst {
590 unsigned char opcode;
591 unsigned char n;
592 Rune c;
593 Reclass *cc;
594 Reinst *x;
595 Reinst *y;
596 };
597
count(struct cstate * g,Renode * node)598 static int count(struct cstate *g, Renode *node)
599 {
600 int min, max, n;
601 if (!node) return 0;
602 switch (node->type) {
603 default: return 1;
604 case P_CAT: return count(g, node->x) + count(g, node->y);
605 case P_ALT: return count(g, node->x) + count(g, node->y) + 2;
606 case P_REP:
607 min = node->m;
608 max = node->n;
609 if (min == max) n = count(g, node->x) * min;
610 else if (max < REPINF) n = count(g, node->x) * max + (max - min);
611 else n = count(g, node->x) * (min + 1) + 2;
612 if (n < 0 || n > MAXPROG) die(g, "program too large");
613 return n;
614 case P_PAR: return count(g, node->x) + 2;
615 case P_PLA: return count(g, node->x) + 2;
616 case P_NLA: return count(g, node->x) + 2;
617 }
618 }
619
emit(Reprog * prog,int opcode)620 static Reinst *emit(Reprog *prog, int opcode)
621 {
622 Reinst *inst = prog->end++;
623 inst->opcode = opcode;
624 inst->n = 0;
625 inst->c = 0;
626 inst->cc = NULL;
627 inst->x = inst->y = NULL;
628 return inst;
629 }
630
compile(Reprog * prog,Renode * node)631 static void compile(Reprog *prog, Renode *node)
632 {
633 Reinst *inst, *split, *jump;
634 int i;
635
636 if (!node)
637 return;
638
639 switch (node->type) {
640 case P_CAT:
641 compile(prog, node->x);
642 compile(prog, node->y);
643 break;
644
645 case P_ALT:
646 split = emit(prog, I_SPLIT);
647 compile(prog, node->x);
648 jump = emit(prog, I_JUMP);
649 compile(prog, node->y);
650 split->x = split + 1;
651 split->y = jump + 1;
652 jump->x = prog->end;
653 break;
654
655 case P_REP:
656 inst = NULL; /* silence compiler warning. assert(node->m > 0). */
657 for (i = 0; i < node->m; ++i) {
658 inst = prog->end;
659 compile(prog, node->x);
660 }
661 if (node->m == node->n)
662 break;
663 if (node->n < REPINF) {
664 for (i = node->m; i < node->n; ++i) {
665 split = emit(prog, I_SPLIT);
666 compile(prog, node->x);
667 if (node->ng) {
668 split->y = split + 1;
669 split->x = prog->end;
670 } else {
671 split->x = split + 1;
672 split->y = prog->end;
673 }
674 }
675 } else if (node->m == 0) {
676 split = emit(prog, I_SPLIT);
677 compile(prog, node->x);
678 jump = emit(prog, I_JUMP);
679 if (node->ng) {
680 split->y = split + 1;
681 split->x = prog->end;
682 } else {
683 split->x = split + 1;
684 split->y = prog->end;
685 }
686 jump->x = split;
687 } else {
688 split = emit(prog, I_SPLIT);
689 if (node->ng) {
690 split->y = inst;
691 split->x = prog->end;
692 } else {
693 split->x = inst;
694 split->y = prog->end;
695 }
696 }
697 break;
698
699 case P_BOL: emit(prog, I_BOL); break;
700 case P_EOL: emit(prog, I_EOL); break;
701 case P_WORD: emit(prog, I_WORD); break;
702 case P_NWORD: emit(prog, I_NWORD); break;
703
704 case P_PAR:
705 inst = emit(prog, I_LPAR);
706 inst->n = node->n;
707 compile(prog, node->x);
708 inst = emit(prog, I_RPAR);
709 inst->n = node->n;
710 break;
711 case P_PLA:
712 split = emit(prog, I_PLA);
713 compile(prog, node->x);
714 emit(prog, I_END);
715 split->x = split + 1;
716 split->y = prog->end;
717 break;
718 case P_NLA:
719 split = emit(prog, I_NLA);
720 compile(prog, node->x);
721 emit(prog, I_END);
722 split->x = split + 1;
723 split->y = prog->end;
724 break;
725
726 case P_ANY:
727 emit(prog, I_ANY);
728 break;
729 case P_CHAR:
730 inst = emit(prog, I_CHAR);
731 inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
732 break;
733 case P_CCLASS:
734 inst = emit(prog, I_CCLASS);
735 inst->cc = node->cc;
736 break;
737 case P_NCCLASS:
738 inst = emit(prog, I_NCCLASS);
739 inst->cc = node->cc;
740 break;
741 case P_REF:
742 inst = emit(prog, I_REF);
743 inst->n = node->n;
744 break;
745 }
746 }
747
748 #ifdef TEST
dumpnode(Renode * node)749 static void dumpnode(Renode *node)
750 {
751 Rune *p;
752 if (!node) { printf("Empty"); return; }
753 switch (node->type) {
754 case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
755 case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
756 case P_REP:
757 printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
758 dumpnode(node->x);
759 printf(")");
760 break;
761 case P_BOL: printf("Bol"); break;
762 case P_EOL: printf("Eol"); break;
763 case P_WORD: printf("Word"); break;
764 case P_NWORD: printf("NotWord"); break;
765 case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
766 case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
767 case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
768 case P_ANY: printf("Any"); break;
769 case P_CHAR: printf("Char(%c)", node->c); break;
770 case P_CCLASS:
771 printf("Class(");
772 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
773 printf(")");
774 break;
775 case P_NCCLASS:
776 printf("NotClass(");
777 for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
778 printf(")");
779 break;
780 case P_REF: printf("Ref(%d)", node->n); break;
781 }
782 }
783
dumpprog(Reprog * prog)784 static void dumpprog(Reprog *prog)
785 {
786 Reinst *inst;
787 int i;
788 for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
789 printf("% 5d: ", i);
790 switch (inst->opcode) {
791 case I_END: puts("end"); break;
792 case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
793 case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
794 case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
795 case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
796 case I_ANY: puts("any"); break;
797 case I_ANYNL: puts("anynl"); break;
798 case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
799 case I_CCLASS: puts("cclass"); break;
800 case I_NCCLASS: puts("ncclass"); break;
801 case I_REF: printf("ref %d\n", inst->n); break;
802 case I_BOL: puts("bol"); break;
803 case I_EOL: puts("eol"); break;
804 case I_WORD: puts("word"); break;
805 case I_NWORD: puts("nword"); break;
806 case I_LPAR: printf("lpar %d\n", inst->n); break;
807 case I_RPAR: printf("rpar %d\n", inst->n); break;
808 }
809 }
810 }
811 #endif
812
regcompx(void * (* alloc)(void * ctx,void * p,int n),void * ctx,const char * pattern,int cflags,const char ** errorp)813 Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
814 const char *pattern, int cflags, const char **errorp)
815 {
816 struct cstate g;
817 Renode *node;
818 Reinst *split, *jump;
819 int i, n;
820
821 g.pstart = NULL;
822 g.prog = NULL;
823
824 if (setjmp(g.kaboom)) {
825 if (errorp) *errorp = g.error;
826 alloc(ctx, g.pstart, 0);
827 alloc(ctx, g.prog, 0);
828 return NULL;
829 }
830
831 g.prog = alloc(ctx, NULL, sizeof (Reprog));
832 if (!g.prog)
833 die(&g, "cannot allocate regular expression");
834 n = strlen(pattern) * 2;
835 if (n > 0) {
836 g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
837 if (!g.pstart)
838 die(&g, "cannot allocate regular expression parse list");
839 }
840
841 g.source = pattern;
842 g.ncclass = 0;
843 g.nsub = 1;
844 for (i = 0; i < MAXSUB; ++i)
845 g.sub[i] = 0;
846
847 g.prog->flags = cflags;
848
849 next(&g);
850 node = parsealt(&g);
851 if (g.lookahead == ')')
852 die(&g, "unmatched ')'");
853 if (g.lookahead != 0)
854 die(&g, "syntax error");
855
856 #ifdef TEST
857 dumpnode(node);
858 putchar('\n');
859 #endif
860
861 n = 6 + count(&g, node);
862 if (n < 0 || n > MAXPROG)
863 die(&g, "program too large");
864
865 g.prog->nsub = g.nsub;
866 g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
867 if (!g.prog->start)
868 die(&g, "cannot allocate regular expression instruction list");
869
870 split = emit(g.prog, I_SPLIT);
871 split->x = split + 3;
872 split->y = split + 1;
873 emit(g.prog, I_ANYNL);
874 jump = emit(g.prog, I_JUMP);
875 jump->x = split;
876 emit(g.prog, I_LPAR);
877 compile(g.prog, node);
878 emit(g.prog, I_RPAR);
879 emit(g.prog, I_END);
880
881 #ifdef TEST
882 dumpprog(g.prog);
883 #endif
884
885 alloc(ctx, g.pstart, 0);
886
887 if (errorp) *errorp = NULL;
888 return g.prog;
889 }
890
regfreex(void * (* alloc)(void * ctx,void * p,int n),void * ctx,Reprog * prog)891 void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
892 {
893 if (prog) {
894 alloc(ctx, prog->start, 0);
895 alloc(ctx, prog, 0);
896 }
897 }
898
default_alloc(void * ctx,void * p,int n)899 static void *default_alloc(void *ctx, void *p, int n)
900 {
901 return realloc(p, (size_t)n);
902 }
903
regcomp(const char * pattern,int cflags,const char ** errorp)904 Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
905 {
906 return regcompx(default_alloc, NULL, pattern, cflags, errorp);
907 }
908
regfree(Reprog * prog)909 void regfree(Reprog *prog)
910 {
911 regfreex(default_alloc, NULL, prog);
912 }
913
914 /* Match */
915
isnewline(int c)916 static int isnewline(int c)
917 {
918 return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
919 }
920
iswordchar(int c)921 static int iswordchar(int c)
922 {
923 return c == '_' ||
924 (c >= 'a' && c <= 'z') ||
925 (c >= 'A' && c <= 'Z') ||
926 (c >= '0' && c <= '9');
927 }
928
incclass(Reclass * cc,Rune c)929 static int incclass(Reclass *cc, Rune c)
930 {
931 Rune *p;
932 for (p = cc->spans; p < cc->end; p += 2)
933 if (p[0] <= c && c <= p[1])
934 return 1;
935 return 0;
936 }
937
incclasscanon(Reclass * cc,Rune c)938 static int incclasscanon(Reclass *cc, Rune c)
939 {
940 Rune *p, r;
941 for (p = cc->spans; p < cc->end; p += 2)
942 for (r = p[0]; r <= p[1]; ++r)
943 if (c == canon(r))
944 return 1;
945 return 0;
946 }
947
strncmpcanon(const char * a,const char * b,int n)948 static int strncmpcanon(const char *a, const char *b, int n)
949 {
950 Rune ra, rb;
951 int c;
952 while (n--) {
953 if (!*a) return -1;
954 if (!*b) return 1;
955 a += chartorune(&ra, a);
956 b += chartorune(&rb, b);
957 c = canon(ra) - canon(rb);
958 if (c)
959 return c;
960 }
961 return 0;
962 }
963
match(Reinst * pc,const char * sp,const char * bol,int flags,Resub * out)964 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
965 {
966 Resub scratch;
967 int i;
968 Rune c;
969
970 for (;;) {
971 switch (pc->opcode) {
972 case I_END:
973 return 1;
974 case I_JUMP:
975 pc = pc->x;
976 break;
977 case I_SPLIT:
978 scratch = *out;
979 if (match(pc->x, sp, bol, flags, &scratch)) {
980 *out = scratch;
981 return 1;
982 }
983 pc = pc->y;
984 break;
985
986 case I_PLA:
987 if (!match(pc->x, sp, bol, flags, out))
988 return 0;
989 pc = pc->y;
990 break;
991 case I_NLA:
992 scratch = *out;
993 if (match(pc->x, sp, bol, flags, &scratch))
994 return 0;
995 pc = pc->y;
996 break;
997
998 case I_ANYNL:
999 sp += chartorune(&c, sp);
1000 if (c == 0)
1001 return 0;
1002 pc = pc + 1;
1003 break;
1004 case I_ANY:
1005 sp += chartorune(&c, sp);
1006 if (c == 0)
1007 return 0;
1008 if (isnewline(c))
1009 return 0;
1010 pc = pc + 1;
1011 break;
1012 case I_CHAR:
1013 sp += chartorune(&c, sp);
1014 if (c == 0)
1015 return 0;
1016 if (flags & REG_ICASE)
1017 c = canon(c);
1018 if (c != pc->c)
1019 return 0;
1020 pc = pc + 1;
1021 break;
1022 case I_CCLASS:
1023 sp += chartorune(&c, sp);
1024 if (c == 0)
1025 return 0;
1026 if (flags & REG_ICASE) {
1027 if (!incclasscanon(pc->cc, canon(c)))
1028 return 0;
1029 } else {
1030 if (!incclass(pc->cc, c))
1031 return 0;
1032 }
1033 pc = pc + 1;
1034 break;
1035 case I_NCCLASS:
1036 sp += chartorune(&c, sp);
1037 if (c == 0)
1038 return 0;
1039 if (flags & REG_ICASE) {
1040 if (incclasscanon(pc->cc, canon(c)))
1041 return 0;
1042 } else {
1043 if (incclass(pc->cc, c))
1044 return 0;
1045 }
1046 pc = pc + 1;
1047 break;
1048 case I_REF:
1049 i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1050 if (flags & REG_ICASE) {
1051 if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1052 return 0;
1053 } else {
1054 if (strncmp(sp, out->sub[pc->n].sp, i))
1055 return 0;
1056 }
1057 if (i > 0)
1058 sp += i;
1059 pc = pc + 1;
1060 break;
1061
1062 case I_BOL:
1063 if (sp == bol && !(flags & REG_NOTBOL)) {
1064 pc = pc + 1;
1065 break;
1066 }
1067 if (flags & REG_NEWLINE) {
1068 if (sp > bol && isnewline(sp[-1])) {
1069 pc = pc + 1;
1070 break;
1071 }
1072 }
1073 return 0;
1074 case I_EOL:
1075 if (*sp == 0) {
1076 pc = pc + 1;
1077 break;
1078 }
1079 if (flags & REG_NEWLINE) {
1080 if (isnewline(*sp)) {
1081 pc = pc + 1;
1082 break;
1083 }
1084 }
1085 return 0;
1086 case I_WORD:
1087 i = sp > bol && iswordchar(sp[-1]);
1088 i ^= iswordchar(sp[0]);
1089 if (!i)
1090 return 0;
1091 pc = pc + 1;
1092 break;
1093 case I_NWORD:
1094 i = sp > bol && iswordchar(sp[-1]);
1095 i ^= iswordchar(sp[0]);
1096 if (i)
1097 return 0;
1098 pc = pc + 1;
1099 break;
1100
1101 case I_LPAR:
1102 out->sub[pc->n].sp = sp;
1103 pc = pc + 1;
1104 break;
1105 case I_RPAR:
1106 out->sub[pc->n].ep = sp;
1107 pc = pc + 1;
1108 break;
1109 default:
1110 return 0;
1111 }
1112 }
1113 }
1114
regexec(Reprog * prog,const char * sp,Resub * sub,int eflags)1115 int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1116 {
1117 Resub scratch;
1118 int i;
1119
1120 if (!sub)
1121 sub = &scratch;
1122
1123 sub->nsub = prog->nsub;
1124 for (i = 0; i < MAXSUB; ++i)
1125 sub->sub[i].sp = sub->sub[i].ep = NULL;
1126
1127 return !match(prog->start, sp, sp, prog->flags | eflags, sub);
1128 }
1129
1130 #ifdef TEST
main(int argc,char ** argv)1131 int main(int argc, char **argv)
1132 {
1133 const char *error;
1134 const char *s;
1135 Reprog *p;
1136 Resub m;
1137 int i;
1138
1139 if (argc > 1) {
1140 p = regcomp(argv[1], 0, &error);
1141 if (!p) {
1142 fprintf(stderr, "regcomp: %s\n", error);
1143 return 1;
1144 }
1145
1146 if (argc > 2) {
1147 s = argv[2];
1148 printf("nsub = %d\n", p->nsub);
1149 if (!regexec(p, s, &m, 0)) {
1150 for (i = 0; i < m.nsub; ++i) {
1151 int n = m.sub[i].ep - m.sub[i].sp;
1152 if (n > 0)
1153 printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1154 else
1155 printf("match %d: n=0 ''\n", i);
1156 }
1157 } else {
1158 printf("no match\n");
1159 }
1160 }
1161 }
1162
1163 return 0;
1164 }
1165 #endif
1166