1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
7  *
8  * See the COPYRIGHT file distributed with this work for additional
9  * information regarding copyright ownership.
10  */
11 
12 #include <config.h>
13 
14 #include <stdbool.h>
15 
16 #include <isc/file.h>
17 #include <isc/print.h>
18 #include <isc/regex.h>
19 #include <isc/string.h>
20 
/*
 * FAIL(reason) abandons validation by jumping to the 'error' label of
 * isc_regex_validate().  When VALREGEX_REPORT_REASON is non-zero the
 * reason string is recorded first so that it can be printed.
 */
#if VALREGEX_REPORT_REASON
#define FAIL(x) do { reason = (x); goto error; } while (0)
#else
#define FAIL(x) do { goto error; } while (0)
#endif

/*
 * Validate the regular expression 'c' for the 'C' locale without
 * compiling it.
 *
 * The syntax checked is the extended style in which unescaped '(', ')',
 * '|', '*', '+', '?' and '{n,m}' are operators, '[...]' is a bracket
 * expression (with '[.x.]', '[=x=]' and '[:class:]' elements) and
 * '\1'..'\9' are back references.
 *
 * Returns:
 *	the number of '(' groups seen in the expression (>= 0) if the
 *	expression is acceptable;
 *	-1 if 'c' is NULL, empty, or malformed.
 *
 * When VALREGEX_REPORT_REASON is non-zero, a short description of the
 * failure is written to stderr before returning -1.
 */
int
isc_regex_validate(const char *c) {
	enum {
		none,		/* outside of any bracket / bound */
		parse_bracket,	/* inside '[' ... ']' */
		parse_bound,	/* inside '{' ... '}' */
		parse_ce,	/* inside '[.' ... '.]' */
		parse_ec,	/* inside '[=' ... '=]' */
		parse_cc	/* inside '[:' ... ':]' */
	} state = none;
	/* Well known character classes. */
	static const char *const cc[] = {
		":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
		":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
		":print:", ":xdigit:"
	};
	bool seen_comma = false;	/* ',' seen in the current bound */
	bool seen_high = false;		/* upper limit digits seen in bound */
	bool seen_char = false;		/* bracket has at least one member */
	bool seen_ec = false;		/* equivalence class is non-empty */
	bool seen_ce = false;		/* collating element is non-empty */
	bool have_atom = false;		/* something a repetition can bind to */
	int group = 0;			/* currently open '(' groups */
	/*
	 * Range tracking inside a bracket expression:
	 *	0 - not in a range;
	 *	2 - '-' just seen, the next member is the range end;
	 *	1 - range end just consumed (a further '-' is an error).
	 */
	int range = 0;
	int sub = 0;			/* total '(' groups seen */
	bool empty_ok = false;		/* "()" is tolerated, "(a|)" is not */
	bool neg = false;		/* bracket started with '^' */
	bool was_multiple = false;	/* previous item was a repetition */
	unsigned int low = 0;		/* bound lower limit */
	unsigned int high = 0;		/* bound upper limit */
	const char *ccname = NULL;	/* start (':') of character class name */
	int range_start = 0;		/* value of the last bracket member */
#if VALREGEX_REPORT_REASON
	const char *reason = "";
#endif

	if (c == NULL || *c == 0) {
		FAIL("empty string");
	}

	while (*c != 0) {
		switch (state) {
		case none:
			switch (*c) {
			case '\\':	/* make literal */
				++c;
				switch (*c) {
				case '1': case '2': case '3':
				case '4': case '5': case '6':
				case '7': case '8': case '9':
					/* Can only refer to a group already seen. */
					if ((*c - '0') > sub) {
						FAIL("bad back reference");
					}
					have_atom = true;
					was_multiple = false;
					break;
				case 0:
					FAIL("escaped end-of-string");
				default:
					goto literal;
				}
				++c;
				break;
			case '[':	/* bracket start */
				++c;
				neg = false;
				was_multiple = false;
				seen_char = false;
				state = parse_bracket;
				break;
			case '{':	/* bound start */
				switch (c[1]) {
				case '0': case '1': case '2': case '3':
				case '4': case '5': case '6': case '7':
				case '8': case '9':
					if (!have_atom) {
						FAIL("no atom");
					}
					if (was_multiple) {
						FAIL("was multiple");
					}
					seen_comma = false;
					seen_high = false;
					low = high = 0;
					state = parse_bound;
					break;
				default:
					/* '{' not followed by a digit is literal. */
					goto literal;
				}
				++c;
				have_atom = true;
				was_multiple = true;
				break;
			case '}':
				goto literal;
			case '(':	/* group start */
				have_atom = false;
				was_multiple = false;
				empty_ok = true;
				++group;
				++sub;
				++c;
				break;
			case ')':	/* group end */
				if (group && !have_atom && !empty_ok) {
					FAIL("empty alternative");
				}
				have_atom = true;
				was_multiple = false;
				/* An unmatched ')' is treated as a literal. */
				if (group != 0) {
					--group;
				}
				++c;
				break;
			case '|':	/* alternative separator */
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = false;
				empty_ok = false;
				was_multiple = false;
				++c;
				break;
			case '^':
			case '$':
				/* Anchors cannot be followed by a repetition. */
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '+':
			case '*':
			case '?':
				if (was_multiple) {
					FAIL("was multiple");
				}
				if (!have_atom) {
					FAIL("no atom");
				}
				have_atom = true;
				was_multiple = true;
				++c;
				break;
			case '.':
			default:
			literal:
				have_atom = true;
				was_multiple = false;
				++c;
				break;
			}
			break;
		case parse_bound:
			switch (*c) {
			case '0': case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				/*
				 * Limits are checked after every digit, so
				 * the accumulators cannot overflow.
				 */
				if (!seen_comma) {
					low = low * 10 +
					      (unsigned int)(*c - '0');
					if (low > 255) {
						FAIL("lower bound too big");
					}
				} else {
					seen_high = true;
					high = high * 10 +
					       (unsigned int)(*c - '0');
					if (high > 255) {
						FAIL("upper bound too big");
					}
				}
				++c;
				break;
			case ',':
				if (seen_comma) {
					FAIL("multiple commas");
				}
				seen_comma = true;
				++c;
				break;
			default:
			case '{':
				FAIL("non digit/comma");
			case '}':
				if (seen_high && low > high) {
					FAIL("bad parse bound");
				}
				seen_comma = false;
				state = none;
				++c;
				break;
			}
			break;
		case parse_bracket:
			switch (*c) {
			case '^':
				/* Only a leading '^' negates the set. */
				if (seen_char || neg) {
					goto inside;
				}
				neg = true;
				++c;
				break;
			case '-':
				/* '-' as a range end or first member is literal. */
				if (range == 2) {
					goto inside;
				}
				if (!seen_char) {
					goto inside;
				}
				if (range == 1) {
					FAIL("bad range");
				}
				range = 2;
				++c;
				break;
			case '[':
				switch (c[1]) {
				case '.':	/* collating element */
					if (range != 0) {
						--range;
					}
					c += 2;
					state = parse_ce;
					seen_ce = false;
					seen_char = true;
					break;
				case '=':	/* equivalence class */
					if (range == 2) {
						FAIL("equivalence class in range");
					}
					c += 2;
					state = parse_ec;
					seen_ec = false;
					seen_char = true;
					break;
				case ':':	/* character class */
					if (range == 2) {
						FAIL("character class in range");
					}
					/* Remember the leading ':' of the name. */
					ccname = c + 1;
					c += 2;
					state = parse_cc;
					seen_char = true;
					break;
				default:
					/*
					 * A '[' that does not introduce one of
					 * the elements above is an ordinary
					 * member and must take part in range
					 * tracking like any other character.
					 */
					goto inside;
				}
				break;
			case ']':
				if (!c[1] && !seen_char) {
					FAIL("unfinished brace");
				}
				/* A leading ']' is a literal member. */
				if (!seen_char) {
					goto inside;
				}
				++c;
				range = 0;
				have_atom = true;
				state = none;
				break;
			default:
			inside:
				seen_char = true;
				if (range == 2 && (*c & 0xff) < range_start) {
					FAIL("out of order range");
				}
				if (range != 0) {
					--range;
				}
				range_start = *c & 0xff;
				++c;
				break;
			}
			break;
		case parse_ce:
			/*
			 * A multi-character collating element sets
			 * range_start to 256, which is larger than any single
			 * character, so that using it as the start of a range
			 * is rejected.
			 */
			switch (*c) {
			case '.':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ce) {
						FAIL("empty ce");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/* The '.' was part of the element. */
					if (seen_ce) {
						range_start = 256;
					} else {
						range_start = '.';
					}
					seen_ce = true;
					break;
				}
				break;
			default:
				if (seen_ce) {
					range_start = 256;
				} else {
					/*
					 * Mask so that the value does not
					 * depend on the signedness of char.
					 */
					range_start = *c & 0xff;
				}
				seen_ce = true;
				++c;
				break;
			}
			break;
		case parse_ec:
			switch (*c) {
			case '=':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ec) {
						FAIL("no ec");
					}
					++c;
					state = parse_bracket;
					break;
				default:
					/* The '=' was part of the class. */
					seen_ec = true;
					break;
				}
				break;
			default:
				seen_ec = true;
				++c;
				break;
			}
			break;
		case parse_cc:
			switch (*c) {
			case ':':
				++c;
				switch (*c) {
				case ']': {
					/*
					 * ccname .. c spans ":name:"; it must
					 * match one of the known classes.
					 */
					size_t namelen = (size_t)(c - ccname);
					bool found = false;
					size_t i;

					for (i = 0;
					     i < sizeof(cc) / sizeof(cc[0]);
					     i++)
					{
						if (strlen(cc[i]) != namelen) {
							continue;
						}
						if (strncmp(cc[i], ccname,
							    namelen) != 0)
						{
							continue;
						}
						found = true;
						break;
					}
					if (!found) {
						FAIL("unknown cc");
					}
					++c;
					state = parse_bracket;
					break;
				}
				default:
					break;
				}
				break;
			default:
				++c;
				break;
			}
			break;
		}
	}
	if (group != 0) {
		FAIL("group open");
	}
	if (state != none) {
		FAIL("incomplete");
	}
	if (!have_atom) {
		FAIL("no atom");
	}
	return (sub);

 error:
#if VALREGEX_REPORT_REASON
	fprintf(stderr, "%s\n", reason);
#endif
	return (-1);
}
369