1 // Copyright 2014 Paul Sokolovsky.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "re1.5.h"
6 
7 #define INSERT_CODE(at, num, pc) \
8     ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num)
9 #define REL(at, to) (to - at - 2)
10 #define EMIT(at, byte) (code ? (code[at] = byte) : (at))
11 #define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err))
12 #define PC (prog->bytelen)
13 
_emit_checked(int at,char * code,int val,bool * err)14 static void _emit_checked(int at, char *code, int val, bool *err) {
15     *err |= val != (int8_t)val;
16     if (code) {
17         code[at] = val;
18     }
19 }
20 
_compilecode(const char * re,ByteProg * prog,int sizecode)21 static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
22 {
23     char *code = sizecode ? NULL : prog->insts;
24     bool err = false;
25     int start = PC;
26     int term = PC;
27     int alt_label = 0;
28 
29     for (; *re && *re != ')'; re++) {
30         switch (*re) {
31         case '\\':
32             re++;
33             if (!*re) return NULL; // Trailing backslash
34             if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
35                 term = PC;
36                 EMIT(PC++, NamedClass);
37                 EMIT(PC++, *re);
38                 prog->len++;
39                 break;
40             }
41             MP_FALLTHROUGH
42         default:
43             term = PC;
44             EMIT(PC++, Char);
45             EMIT(PC++, *re);
46             prog->len++;
47             break;
48         case '.':
49             term = PC;
50             EMIT(PC++, Any);
51             prog->len++;
52             break;
53         case '[': {
54             int cnt;
55             term = PC;
56             re++;
57             if (*re == '^') {
58                 EMIT(PC++, ClassNot);
59                 re++;
60             } else {
61                 EMIT(PC++, Class);
62             }
63             PC++; // Skip # of pair byte
64             prog->len++;
65             for (cnt = 0; *re != ']'; re++, cnt++) {
66                 if (*re == '\\') {
67                     ++re;
68                 }
69                 if (!*re) return NULL;
70                 EMIT(PC++, *re);
71                 if (re[1] == '-' && re[2] != ']') {
72                     re += 2;
73                 }
74                 EMIT(PC++, *re);
75             }
76             EMIT_CHECKED(term + 1, cnt);
77             break;
78         }
79         case '(': {
80             term = PC;
81             int sub = 0;
82             int capture = re[1] != '?' || re[2] != ':';
83 
84             if (capture) {
85                 sub = ++prog->sub;
86                 EMIT(PC++, Save);
87                 EMIT_CHECKED(PC++, 2 * sub);
88                 prog->len++;
89             } else {
90                     re += 2;
91             }
92 
93             re = _compilecode(re + 1, prog, sizecode);
94             if (re == NULL || *re != ')') return NULL; // error, or no matching paren
95 
96             if (capture) {
97                 EMIT(PC++, Save);
98                 EMIT_CHECKED(PC++, 2 * sub + 1);
99                 prog->len++;
100             }
101 
102             break;
103         }
104         case '?':
105             if (PC == term) return NULL; // nothing to repeat
106             INSERT_CODE(term, 2, PC);
107             if (re[1] == '?') {
108                 EMIT(term, RSplit);
109                 re++;
110             } else {
111                 EMIT(term, Split);
112             }
113             EMIT_CHECKED(term + 1, REL(term, PC));
114             prog->len++;
115             term = PC;
116             break;
117         case '*':
118             if (PC == term) return NULL; // nothing to repeat
119             INSERT_CODE(term, 2, PC);
120             EMIT(PC, Jmp);
121             EMIT_CHECKED(PC + 1, REL(PC, term));
122             PC += 2;
123             if (re[1] == '?') {
124                 EMIT(term, RSplit);
125                 re++;
126             } else {
127                 EMIT(term, Split);
128             }
129             EMIT_CHECKED(term + 1, REL(term, PC));
130             prog->len += 2;
131             term = PC;
132             break;
133         case '+':
134             if (PC == term) return NULL; // nothing to repeat
135             if (re[1] == '?') {
136                 EMIT(PC, Split);
137                 re++;
138             } else {
139                 EMIT(PC, RSplit);
140             }
141             EMIT_CHECKED(PC + 1, REL(PC, term));
142             PC += 2;
143             prog->len++;
144             term = PC;
145             break;
146         case '|':
147             if (alt_label) {
148                 EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
149             }
150             INSERT_CODE(start, 2, PC);
151             EMIT(PC++, Jmp);
152             alt_label = PC++;
153             EMIT(start, Split);
154             EMIT_CHECKED(start + 1, REL(start, PC));
155             prog->len += 2;
156             term = PC;
157             break;
158         case '^':
159             EMIT(PC++, Bol);
160             prog->len++;
161             term = PC;
162             break;
163         case '$':
164             EMIT(PC++, Eol);
165             prog->len++;
166             term = PC;
167             break;
168         }
169     }
170 
171     if (alt_label) {
172         EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
173     }
174     return err ? NULL : re;
175 }
176 
re1_5_sizecode(const char * re)177 int re1_5_sizecode(const char *re)
178 {
179     ByteProg dummyprog = {
180          // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
181         .bytelen = 5 + NON_ANCHORED_PREFIX
182     };
183 
184     if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
185 
186     return dummyprog.bytelen;
187 }
188 
re1_5_compilecode(ByteProg * prog,const char * re)189 int re1_5_compilecode(ByteProg *prog, const char *re)
190 {
191     prog->len = 0;
192     prog->bytelen = 0;
193     prog->sub = 0;
194 
195     // Add code to implement non-anchored operation ("search"),
196     // for anchored operation ("match"), this code will be just skipped.
197     // TODO: Implement search in much more efficient manner
198     prog->insts[prog->bytelen++] = RSplit;
199     prog->insts[prog->bytelen++] = 3;
200     prog->insts[prog->bytelen++] = Any;
201     prog->insts[prog->bytelen++] = Jmp;
202     prog->insts[prog->bytelen++] = -5;
203     prog->len += 3;
204 
205     prog->insts[prog->bytelen++] = Save;
206     prog->insts[prog->bytelen++] = 0;
207     prog->len++;
208 
209     re = _compilecode(re, prog, /*sizecode*/0);
210     if (re == NULL || *re) return 1;
211 
212     prog->insts[prog->bytelen++] = Save;
213     prog->insts[prog->bytelen++] = 1;
214     prog->len++;
215 
216     prog->insts[prog->bytelen++] = Match;
217     prog->len++;
218 
219     return 0;
220 }
221 
222 #if 0
223 int main(int argc, char *argv[])
224 {
225     int pc = 0;
226     ByteProg *code = re1_5_compilecode(argv[1]);
227     re1_5_dumpcode(code);
228 }
229 #endif
230