1 /*	$NetBSD: parse_rx.c,v 1.1.1.1 2008/12/22 00:18:37 haad Exp $	*/
2 
3 /*
4  * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
5  * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6  *
7  * This file is part of the device-mapper userspace tools.
8  *
9  * This copyrighted material is made available to anyone wishing to use,
10  * modify, copy, or redistribute it subject to the terms and conditions
11  * of the GNU Lesser General Public License v.2.1.
12  *
13  * You should have received a copy of the GNU Lesser General Public License
14  * along with this program; if not, write to the Free Software Foundation,
15  * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16  */
17 
18 #include "dmlib.h"
19 #include "parse_rx.h"
20 
21 struct parse_sp {		/* scratch pad for the parsing process */
22 	struct dm_pool *mem;
23 	int type;		/* token type, 0 indicates a charset */
24 	dm_bitset_t charset;	/* The current charset */
25 	const char *cursor;	/* where we are in the regex */
26 	const char *rx_end;	/* 1pte for the expression being parsed */
27 };
28 
29 static struct rx_node *_or_term(struct parse_sp *ps);
30 
31 static void _single_char(struct parse_sp *ps, unsigned int c, const char *ptr)
32 {
33 	ps->type = 0;
34 	ps->cursor = ptr + 1;
35 	dm_bit_clear_all(ps->charset);
36 	dm_bit_set(ps->charset, c);
37 }
38 
39 /*
40  * Get the next token from the regular expression.
41  * Returns: 1 success, 0 end of input, -1 error.
42  */
43 static int _rx_get_token(struct parse_sp *ps)
44 {
45 	int neg = 0, range = 0;
46 	char c, lc = 0;
47 	const char *ptr = ps->cursor;
48 	if (ptr == ps->rx_end) {	/* end of input ? */
49 		ps->type = -1;
50 		return 0;
51 	}
52 
53 	switch (*ptr) {
54 		/* charsets and ncharsets */
55 	case '[':
56 		ptr++;
57 		if (*ptr == '^') {
58 			dm_bit_set_all(ps->charset);
59 
60 			/* never transition on zero */
61 			dm_bit_clear(ps->charset, 0);
62 			neg = 1;
63 			ptr++;
64 
65 		} else
66 			dm_bit_clear_all(ps->charset);
67 
68 		while ((ptr < ps->rx_end) && (*ptr != ']')) {
69 			if (*ptr == '\\') {
70 				/* an escaped character */
71 				ptr++;
72 				switch (*ptr) {
73 				case 'n':
74 					c = '\n';
75 					break;
76 				case 'r':
77 					c = '\r';
78 					break;
79 				case 't':
80 					c = '\t';
81 					break;
82 				default:
83 					c = *ptr;
84 				}
85 			} else if (*ptr == '-' && lc) {
86 				/* we've got a range on our hands */
87 				range = 1;
88 				ptr++;
89 				if (ptr == ps->rx_end) {
90 					log_error("Incomplete range"
91 						  "specification");
92 					return -1;
93 				}
94 				c = *ptr;
95 			} else
96 				c = *ptr;
97 
98 			if (range) {
99 				/* add lc - c into the bitset */
100 				if (lc > c) {
101 					char tmp = c;
102 					c = lc;
103 					lc = tmp;
104 				}
105 
106 				for (; lc <= c; lc++) {
107 					if (neg)
108 						dm_bit_clear(ps->charset, lc);
109 					else
110 						dm_bit_set(ps->charset, lc);
111 				}
112 				range = 0;
113 			} else {
114 				/* add c into the bitset */
115 				if (neg)
116 					dm_bit_clear(ps->charset, c);
117 				else
118 					dm_bit_set(ps->charset, c);
119 			}
120 			ptr++;
121 			lc = c;
122 		}
123 
124 		if (ptr >= ps->rx_end) {
125 			ps->type = -1;
126 			return -1;
127 		}
128 
129 		ps->type = 0;
130 		ps->cursor = ptr + 1;
131 		break;
132 
133 		/* These characters are special, we just return their ASCII
134 		   codes as the type.  Sorted into ascending order to help the
135 		   compiler */
136 	case '(':
137 	case ')':
138 	case '*':
139 	case '+':
140 	case '?':
141 	case '|':
142 		ps->type = (int) *ptr;
143 		ps->cursor = ptr + 1;
144 		break;
145 
146 	case '^':
147 		_single_char(ps, HAT_CHAR, ptr);
148 		break;
149 
150 	case '$':
151 		_single_char(ps, DOLLAR_CHAR, ptr);
152 		break;
153 
154 	case '.':
155 		/* The 'all but newline' character set */
156 		ps->type = 0;
157 		ps->cursor = ptr + 1;
158 		dm_bit_set_all(ps->charset);
159 		dm_bit_clear(ps->charset, (int) '\n');
160 		dm_bit_clear(ps->charset, (int) '\r');
161 		dm_bit_clear(ps->charset, 0);
162 		break;
163 
164 	case '\\':
165 		/* escaped character */
166 		ptr++;
167 		if (ptr >= ps->rx_end) {
168 			log_error("Badly quoted character at end "
169 				  "of expression");
170 			ps->type = -1;
171 			return -1;
172 		}
173 
174 		ps->type = 0;
175 		ps->cursor = ptr + 1;
176 		dm_bit_clear_all(ps->charset);
177 		switch (*ptr) {
178 		case 'n':
179 			dm_bit_set(ps->charset, (int) '\n');
180 			break;
181 		case 'r':
182 			dm_bit_set(ps->charset, (int) '\r');
183 			break;
184 		case 't':
185 			dm_bit_set(ps->charset, (int) '\t');
186 			break;
187 		default:
188 			dm_bit_set(ps->charset, (int) *ptr);
189 		}
190 		break;
191 
192 	default:
193 		/* add a single character to the bitset */
194 		ps->type = 0;
195 		ps->cursor = ptr + 1;
196 		dm_bit_clear_all(ps->charset);
197 		dm_bit_set(ps->charset, (int) *ptr);
198 		break;
199 	}
200 
201 	return 1;
202 }
203 
204 static struct rx_node *_node(struct dm_pool *mem, int type,
205 			     struct rx_node *l, struct rx_node *r)
206 {
207 	struct rx_node *n = dm_pool_zalloc(mem, sizeof(*n));
208 
209 	if (n) {
210 		if (!(n->charset = dm_bitset_create(mem, 256))) {
211 			dm_pool_free(mem, n);
212 			return NULL;
213 		}
214 
215 		n->type = type;
216 		n->left = l;
217 		n->right = r;
218 	}
219 
220 	return n;
221 }
222 
223 static struct rx_node *_term(struct parse_sp *ps)
224 {
225 	struct rx_node *n;
226 
227 	switch (ps->type) {
228 	case 0:
229 		if (!(n = _node(ps->mem, CHARSET, NULL, NULL))) {
230 			stack;
231 			return NULL;
232 		}
233 
234 		dm_bit_copy(n->charset, ps->charset);
235 		_rx_get_token(ps);	/* match charset */
236 		break;
237 
238 	case '(':
239 		_rx_get_token(ps);	/* match '(' */
240 		n = _or_term(ps);
241 		if (ps->type != ')') {
242 			log_error("missing ')' in regular expression");
243 			return 0;
244 		}
245 		_rx_get_token(ps);	/* match ')' */
246 		break;
247 
248 	default:
249 		n = 0;
250 	}
251 
252 	return n;
253 }
254 
255 static struct rx_node *_closure_term(struct parse_sp *ps)
256 {
257 	struct rx_node *l, *n;
258 
259 	if (!(l = _term(ps)))
260 		return NULL;
261 
262 	for (;;) {
263 		switch (ps->type) {
264 		case '*':
265 			n = _node(ps->mem, STAR, l, NULL);
266 			break;
267 
268 		case '+':
269 			n = _node(ps->mem, PLUS, l, NULL);
270 			break;
271 
272 		case '?':
273 			n = _node(ps->mem, QUEST, l, NULL);
274 			break;
275 
276 		default:
277 			return l;
278 		}
279 
280 		if (!n) {
281 			stack;
282 			return NULL;
283 		}
284 
285 		_rx_get_token(ps);
286 		l = n;
287 	}
288 
289 	return n;
290 }
291 
292 static struct rx_node *_cat_term(struct parse_sp *ps)
293 {
294 	struct rx_node *l, *r, *n;
295 
296 	if (!(l = _closure_term(ps)))
297 		return NULL;
298 
299 	if (ps->type == '|')
300 		return l;
301 
302 	if (!(r = _cat_term(ps)))
303 		return l;
304 
305 	if (!(n = _node(ps->mem, CAT, l, r)))
306 		stack;
307 
308 	return n;
309 }
310 
311 static struct rx_node *_or_term(struct parse_sp *ps)
312 {
313 	struct rx_node *l, *r, *n;
314 
315 	if (!(l = _cat_term(ps)))
316 		return NULL;
317 
318 	if (ps->type != '|')
319 		return l;
320 
321 	_rx_get_token(ps);		/* match '|' */
322 
323 	if (!(r = _or_term(ps))) {
324 		log_error("Badly formed 'or' expression");
325 		return NULL;
326 	}
327 
328 	if (!(n = _node(ps->mem, OR, l, r)))
329 		stack;
330 
331 	return n;
332 }
333 
334 struct rx_node *rx_parse_tok(struct dm_pool *mem,
335 			     const char *begin, const char *end)
336 {
337 	struct rx_node *r;
338 	struct parse_sp *ps = dm_pool_zalloc(mem, sizeof(*ps));
339 
340 	if (!ps) {
341 		stack;
342 		return NULL;
343 	}
344 
345 	ps->mem = mem;
346 	ps->charset = dm_bitset_create(mem, 256);
347 	ps->cursor = begin;
348 	ps->rx_end = end;
349 	_rx_get_token(ps);		/* load the first token */
350 
351 	if (!(r = _or_term(ps))) {
352 		log_error("Parse error in regex");
353 		dm_pool_free(mem, ps);
354 	}
355 
356 	return r;
357 }
358 
359 struct rx_node *rx_parse_str(struct dm_pool *mem, const char *str)
360 {
361 	return rx_parse_tok(mem, str, str + strlen(str));
362 }
363