xref: /openbsd/usr.bin/dig/lib/isc/regex.c (revision b73bdc82)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 #include <isc/regex.h>
18 #include <isc/types.h>
19 #include <string.h>
20 
/*
 * Validate a POSIX extended regular expression as interpreted in the
 * 'C' locale, without compiling it.
 *
 * The expression is scanned once by a small state machine that tracks
 * whether the scanner is at top level, inside a bracket expression
 * ("[...]"), inside a bound ("{m,n}"), or inside one of the bracket
 * sub-forms "[.ce.]", "[=ec=]" or "[:class:]".
 *
 * Parameters:
 *	c	NUL-terminated expression to check; may be NULL.
 *
 * Returns:
 *	-1 if 'c' is NULL, empty, or not a valid expression; otherwise the
 *	number of '(' groups seen (the count of sub-expressions).
 */
int
isc_regex_validate(const char *c) {
	enum {
		none, parse_bracket, parse_bound,
		parse_ce, parse_ec, parse_cc
	} state = none;
	/* Well known character classes. */
	static const char *const cc[] = {
		":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
		":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
		":print:", ":xdigit:"
	};
	int seen_comma = 0;	/* ',' seen in the current bound */
	int seen_high = 0;	/* upper limit digits seen in the bound */
	int seen_char = 0;	/* bracket expression has a member */
	int seen_ec = 0;	/* "[=...=]" has content */
	int seen_ce = 0;	/* "[. ... .]" has content */
	int have_atom = 0;	/* something a repetition could apply to */
	int group = 0;		/* currently open '(' groups */
	int range = 0;		/* 2: just saw '-', 1: just saw range end */
	int sub = 0;		/* total '(' groups seen */
	int empty_ok = 0;	/* "()" is allowed right after '(' */
	int neg = 0;		/* bracket expression started with '^' */
	int was_multiple = 0;	/* previous item was a repetition/anchor */
	unsigned int low = 0;
	unsigned int high = 0;
	const char *ccname = NULL;
	int range_start = 0;	/* value of the last possible range start */

	if (c == NULL || *c == 0)
		return(-1);

	while (*c != 0) {
		switch (state) {
		case none:
			switch (*c) {
			case '\\':	/* make literal */
				++c;
				switch (*c) {
				case '1': case '2': case '3':
				case '4': case '5': case '6':
				case '7': case '8': case '9':
					/* Back reference to a group not yet seen. */
					if ((*c - '0') > sub)
						return(-1);
					have_atom = 1;
					was_multiple = 0;
					break;
				case 0:
					/* Trailing backslash. */
					return(-1);
				default:
					goto literal;
				}
				++c;
				break;
			case '[':	/* bracket start */
				++c;
				neg = 0;
				was_multiple = 0;
				seen_char = 0;
				state = parse_bracket;
				break;
			case '{': 	/* bound start */
				switch (c[1]) {
				case '0': case '1': case '2': case '3':
				case '4': case '5': case '6': case '7':
				case '8': case '9':
					if (!have_atom)
						return(-1);
					if (was_multiple)
						return(-1);
					seen_comma = 0;
					seen_high = 0;
					low = high = 0;
					state = parse_bound;
					break;
				default:
					/* '{' not followed by a digit is a literal. */
					goto literal;
				}
				++c;
				have_atom = 1;
				was_multiple = 1;
				break;
			case '}':
				goto literal;
			case '(':	/* group start */
				have_atom = 0;
				was_multiple = 0;
				empty_ok = 1;
				++group;
				++sub;
				++c;
				break;
			case ')':	/* group end */
				if (group && !have_atom && !empty_ok)
					return(-1);
				have_atom = 1;
				was_multiple = 0;
				/* An unmatched ')' is accepted as a literal. */
				if (group != 0)
					--group;
				++c;
				break;
			case '|':	/* alternative separator */
				if (!have_atom)
					return(-1);
				have_atom = 0;
				empty_ok = 0;
				was_multiple = 0;
				++c;
				break;
			case '^':
			case '$':
				have_atom = 1;
				was_multiple = 1;
				++c;
				break;
			case '+':
			case '*':
			case '?':
				if (was_multiple)
					return(-1);
				if (!have_atom)
					return(-1);
				have_atom = 1;
				was_multiple = 1;
				++c;
				break;
			case '.':
			default:
			literal:
				have_atom = 1;
				was_multiple = 0;
				++c;
				break;
			}
			break;
		case parse_bound:
			switch (*c) {
			case '0': case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				if (!seen_comma) {
					low = low * 10 + *c - '0';
					if (low > 255)
						return(-1);
				} else {
					seen_high = 1;
					high = high * 10 + *c - '0';
					if (high > 255)
						return(-1);
				}
				++c;
				break;
			case ',':
				if (seen_comma)
					return(-1);
				seen_comma = 1;
				++c;
				break;
			default:
			case '{':
				return(-1);
			case '}':
				if (seen_high && low > high)
					return(-1);
				seen_comma = 0;
				state = none;
				++c;
				break;
			}
			break;
		case parse_bracket:
			switch (*c) {
			case '^':
				/* Only a leading '^' negates; later ones are members. */
				if (seen_char || neg)
					goto inside;
				neg = 1;
				++c;
				break;
			case '-':
				if (range == 2)
					goto inside;
				if (!seen_char)
					goto inside;
				/* "a-b-c": a range end cannot start a new range. */
				if (range == 1)
					return(-1);
				range = 2;
				++c;
				break;
			case '[':
				/*
				 * c[0] is '[' so c[1] is within the string
				 * (at worst it is the terminating NUL).
				 *
				 * NOTE(review): "[=...=]" and "[:...:]" do not
				 * update range_start, so a range beginning with
				 * one of them is checked against an earlier
				 * value -- confirm whether that is intended.
				 */
				switch (c[1]) {
				case '.':	/* collating element */
					if (range != 0)
						--range;
					c += 2;
					state = parse_ce;
					seen_ce = 0;
					break;
				case '=':	/* equivalence class */
					if (range == 2)
						return(-1);
					c += 2;
					state = parse_ec;
					seen_ec = 0;
					break;
				case ':':	/* character class */
					if (range == 2)
						return(-1);
					/* Name is matched from the leading ':'. */
					ccname = c + 1;
					c += 2;
					state = parse_cc;
					break;
				default:
					/*
					 * A '[' that does not open one of the
					 * forms above is an ordinary member and
					 * must take part in range checking.
					 */
					goto inside;
				}
				seen_char = 1;
				break;
			case ']':
				if (!c[1] && !seen_char)
					return(-1);
				/* A ']' in first position is a member. */
				if (!seen_char)
					goto inside;
				++c;
				range = 0;
				have_atom = 1;
				state = none;
				break;
			default:
			inside:
				seen_char = 1;
				/* Reject reversed ranges such as "z-a". */
				if (range == 2 && (*c & 0xff) < range_start)
					return(-1);
				if (range != 0)
					--range;
				range_start = *c & 0xff;
				++c;
				break;
			}
			break;
		case parse_ce:
			switch (*c) {
			case '.':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ce)
						return(-1);
					++c;
					state = parse_bracket;
					break;
				default:
					/*
					 * The '.' was content.  Multi-character
					 * elements get 256 so that no single
					 * byte can follow them as a range end.
					 */
					if (seen_ce)
						range_start = 256;
					else
						range_start = '.';
					seen_ce = 1;
					break;
				}
				break;
			default:
				if (seen_ce)
					range_start = 256;
				else
					/* Mask so signed char cannot go negative. */
					range_start = *c & 0xff;
				seen_ce = 1;
				++c;
				break;
			}
			break;
		case parse_ec:
			switch (*c) {
			case '=':
				++c;
				switch (*c) {
				case ']':
					if (!seen_ec)
						return(-1);
					++c;
					state = parse_bracket;
					break;
				default:
					seen_ec = 1;
					break;
				}
				break;
			default:
				seen_ec = 1;
				++c;
				break;
			}
			break;
		case parse_cc:
			switch (*c) {
			case ':':
				++c;
				switch (*c) {
				case ']': {
					size_t i;
					size_t namelen = (size_t)(c - ccname);
					int found = 0;

					/* ccname..c spans ":name:" exactly. */
					for (i = 0;
					     i < sizeof(cc) / sizeof(*cc);
					     i++)
					{
						size_t len = strlen(cc[i]);

						if (len != namelen)
							continue;
						if (strncmp(cc[i], ccname, len))
							continue;
						found = 1;
						break;
					}
					if (!found)
						return(-1);
					++c;
					state = parse_bracket;
					break;
					}
				default:
					break;
				}
				break;
			default:
				++c;
				break;
			}
			break;
		}
	}
	/* Unclosed group, unterminated bracket/bound, or nothing to match. */
	if (group != 0)
		return(-1);
	if (state != none)
		return(-1);
	if (!have_atom)
		return(-1);
	return (sub);
}
354