1 /* tree.c - parse trees for regexps
2 *
3 ****************************************************************
4 * Copyright (C) 1998, 2000 Thomas Lord
5 *
6 * See the file "COPYING" for further information about
7 * the copyright and warranty status of this work.
8 */
9
10
11
12 #include "hackerlab/bugs/panic.h"
13 #include "hackerlab/char/str.h"
14 #include "hackerlab/mem/mem.h"
15 #include "hackerlab/rx/nfa-cache.h"
16 #include "hackerlab/rx/tree.h"
17
18
19 static unsigned long exps_allocated = 0;
20 static unsigned long exps_freed = 0;
21
22 /****************************************************************
23 *(h0 "Regexp Expression Trees"
24 * :subtitle "rx/rexp.c"
25 * :includes ("rx/tree.h"))
26 *
27 *
28 * Every regexp has a natural representation as a syntax tree. The
29 * type `struct rx_exp_node' is used to represent one node of such a
30 * tree. Functions are provided that construct and manipulate
31 * expression trees representing regexps.
32 *
33 * Syntax trees can be constructed directly by programs as a way to
34 * specify a regexp and strings obeying standard regexp syntax can be
35 * parsed to produce syntax trees. Regexp parsing is explained in
36 * xref:"Regexp Parsing". The topic of this chapter is the output
37 * of parsing: the data structure used to represent a regexp syntax
38 * tree.
39 *
40 * Rx is able to compute some analytic (recursively defined)
41 * properties of regexp expression tree nodes. Therefore, the Rx
42 * representation of expression tree nodes contains some "extra"
43 * fields to hold that data.
44 *
45 * Rx is able to translate an expression tree into an NFA and
46 * indirectly into a DFA. The performance of this translation is
47 * commonly critical and is greatly improved by caching information
48 * in still more "extra" fields of expression nodes.
49 */
50 /*(menu)
51 */
52
53 /*(include-documentation "tree.h")
54 */
55
56 /****************************************************************
57 *(h1 "Regexp Tree Allocation and Initialization")
58 */
59
60 /*(c rx_exp_node)
61 * struct rx_exp_node * rx_exp_node (int type);
62 *
63 * Allocate a new expression node and initialize its type
64 * and reference count. The reference count is initialized to 1,
65 * the type to `type', and all other fields to 0.
66 *
67 * If the allocation fails, return 0.
68 *
69 */
70 struct rx_exp_node *
rx_exp_node(enum rx_exp_node_type type)71 rx_exp_node (enum rx_exp_node_type type)
72 {
73 struct rx_exp_node *n;
74
75 n = (struct rx_exp_node *) rx_nfa_cache_malloc (sizeof (*n));
76 if (!n)
77 return 0;
78 ++exps_allocated;
79 mem_set0 ((char *)n, sizeof (*n));
80 n->type = type;
81 n->refs = 1;
82 return n;
83 }
84
85
86 /*(c rx_mk_r_cset)
87 * struct rx_exp_node * rx_mk_r_cset (enum rx_exp_node_type type,
88 * int size, bits b);
89 *
90 * Allocate a new character set expression node. The reference count
91 * is initialized to 1, the type to `type', the `cset' field to a
92 * newly allocated copy of `b', the `cset_size' field to `size', and
93 * all other fields to 0.
94 *
95 * If any allocation fails, return 0;
96 *
97 * `type' should be `r_cset'.
98 */
99 struct rx_exp_node *
rx_mk_r_cset(enum rx_exp_node_type type,int size,bits b)100 rx_mk_r_cset (enum rx_exp_node_type type, int size, bits b)
101 {
102 struct rx_exp_node * n;
103
104 if (type != r_cset)
105 panic ("unreasonable rexp node type in rx_mk_r_cset");
106
107 n = rx_exp_node (type);
108 if (!n)
109 return 0;
110 n->cset = bits_dup (b);
111 if (!n->cset)
112 {
113 rx_free_exp (n);
114 return 0;
115 }
116 n->cset_size = size;
117 return n;
118 }
119
120
121 /*(c rx_mk_r_cset_take)
122 * struct rx_exp_node * rx_mk_r_cset_take (enum rx_exp_node_type type,
123 * int size, bits b);
124 *
125 * Allocate a new character set expression node. The reference count
126 * is initialized to 1, the type to `type', the `cset' field to a
127 * newly allocated copy of `b', the `cset_size' field to `size', and
128 * all other fields to 0.
129 *
130 * If any allocation fails, return 0.
131 *
132 * `type' should be `r_cset'.
133 */
134 struct rx_exp_node *
rx_mk_r_cset_take(enum rx_exp_node_type type,int size,bits b)135 rx_mk_r_cset_take (enum rx_exp_node_type type, int size, bits b)
136 {
137 struct rx_exp_node * n;
138
139 if (type != r_cset)
140 panic ("unreasonable rexp node type in rx_mk_r_cset_take");
141 n = rx_exp_node (type);
142 if (!n)
143 return 0;
144 n->cset = b;
145 n->cset_size = size;
146 return n;
147 }
148
149
150 /*(c rx_mk_r_binop)
151 * struct rx_exp_node * rx_mk_r_binop (enum rx_exp_node_type type,
152 * struct rx_exp_node * a,
153 * struct rx_exp_node * b);
154 *
155 * Allocate a regexp expression node with two subexpressions. The
156 * reference count is initialized to 1, the type to `type', the `left'
157 * and `right' fields to `a' and `b', and all other fields to 0.
158 *
159 * If any allocation fails, return 0.
160 *
161 * The reference counts of `a' and `b' are not modified by this
162 * function -- the caller's reference count to those nodes is taken
163 * over by this function.
164 *
165 * `type' should be `r_concat' `r_right_concat' or `r_alternate'.
166 */
167 struct rx_exp_node *
rx_mk_r_binop(enum rx_exp_node_type type,struct rx_exp_node * a,struct rx_exp_node * b)168 rx_mk_r_binop (enum rx_exp_node_type type,
169 struct rx_exp_node * a,
170 struct rx_exp_node * b)
171 {
172 struct rx_exp_node * n;
173 if ((type != r_concat) && (type != r_right_concat) && (type != r_alternate))
174 panic ("unreasonable rexp node type in rx_mk_r_binop");
175 n = rx_exp_node (type);
176 if (!n)
177 return 0;
178 n->left = a;
179 n->right = b;
180 return n;
181 }
182
183
184 /*(c rx_mk_r_monop)
185 * struct rx_exp_node * rx_mk_r_monop (enum rx_exp_node_type type,
186 * struct rx_exp_node * a);
187 *
188 * Allocate a regexp expression node with one subexpression. The
189 * reference count is initialized to 1, the type to `type', the `left'
190 * field to `a', and all other fields to 0.
191 *
192 * If any allocation fails, return 0.
193 *
194 * The reference count of `a' is not modified by this function -- the
195 * caller's reference count to that node is taken over by this
196 * function.
197 *
198 * `type' should be `r_star', `r_interval', `r_parens', or `r_context'.
199 */
200 struct rx_exp_node *
rx_mk_r_monop(enum rx_exp_node_type type,struct rx_exp_node * a)201 rx_mk_r_monop (enum rx_exp_node_type type,
202 struct rx_exp_node * a)
203 {
204 struct rx_exp_node * n;
205 if ( (type != r_star)
206 && (type != r_interval)
207 && (type != r_parens)
208 && (type != r_context))
209 panic ("unreasonable rexp node type in rx_mk_r_monop");
210 n = rx_exp_node (type);
211 if (!n)
212 return 0;
213 n->left = a;
214 n->right = 0;
215 return n;
216 }
217
218
219
220 /*(c rx_mk_r_str)
221 * struct rx_exp_node * rx_mk_r_str (enum rx_exp_node_type type,
222 * const t_uchar * s,
223 * size_t len,
224 * enum uni_encoding_scheme encoding);
225 *
226 * Allocate a regexp expression node whose parameter is a string. The
227 * reference count is initialized to 1, the type to `type', the `str'
228 * field to a newly allocated copy of the `len'-byte string `s', the
229 * `str_len' field is initialized to `len', and all other fields to 0.
230 *
231 * If any allocation fails, return 0.
232 *
233 * `type' should be `r_string'.
234 */
235 struct rx_exp_node *
rx_mk_r_str(enum rx_exp_node_type type,const t_uchar * s,size_t len,enum uni_encoding_scheme encoding)236 rx_mk_r_str (enum rx_exp_node_type type,
237 const t_uchar * s,
238 size_t len,
239 enum uni_encoding_scheme encoding)
240 {
241 struct rx_exp_node *n;
242
243 if (type != r_string)
244 panic ("unreasonable rexp node type in rx_mk_r_str_c");
245 n = rx_exp_node (type);
246 if (!n)
247 return 0;
248 n->str = (t_uchar *)rx_nfa_cache_malloc (len);
249 if (len && !n->str)
250 {
251 rx_free_exp (n);
252 return 0;
253 }
254 mem_move (n->str, s, len);
255 n->str_len = len;
256 n->encoding = encoding;
257 return n;
258 }
259
260
261 /*(c rx_mk_r_int)
262 * struct rx_exp_node * rx_mk_r_int (enum rx_exp_node_type type,
263 * int intval);
264 *
265 * Allocate a regexp expression node whose parameter is an integer.
266 * The reference count is initialized to 1, the type to `type', the
267 * `intval' field is initialized to `intval', and all other fields to
268 * 0.
269 *
270 * If any allocation fails, return 0.
271 *
272 * `type' should be `r_cut', `r_parens', or `r_context'.
273 */
274 struct rx_exp_node *
rx_mk_r_int(enum rx_exp_node_type type,int intval)275 rx_mk_r_int (enum rx_exp_node_type type,
276 int intval)
277 {
278 struct rx_exp_node * n;
279 if ( (type != r_cut)
280 && (type != r_parens)
281 && (type != r_context))
282 panic ("unreasonable rexp node type in rx_mk_r_int");
283 n = rx_exp_node (type);
284 if (!n)
285 return 0;
286 n->intval = intval;
287 return n;
288 }
289
290
291 /*(c rx_mk_r_subexp_int)
292 * struct rx_exp_node * rx_mk_r_subexp_int (enum rx_exp_node_type type,
293 * struct rx_exp_node * subexp,
294 * int intval);
295 *
296 * Allocate a regexp expression node whose parameters are a
297 * subexpression and an integer. The reference count is initialized
298 * to 1, the type to `type', the `left' field is initialized to
299 * `subexp', the `intval' field is initialized to `intval', and all
300 * other fields to 0.
301 *
302 * The reference count of `subexp' is not modified by this function --
303 * the caller's reference count to that node is taken over by this
304 * function.
305 *
306 * If any allocation fails, return 0.
307 *
308 * `type' should be `r_parens'. `intval' is the subexpression number
309 * for this parenthesized expression (the number used for
310 * backreferences). If `intval' is 0, this is an anonymous
311 * subexpression (it can not be backreferenced).
312 */
313 struct rx_exp_node *
rx_mk_r_subexp_int(enum rx_exp_node_type type,struct rx_exp_node * subexp,int intval)314 rx_mk_r_subexp_int (enum rx_exp_node_type type,
315 struct rx_exp_node * subexp,
316 int intval)
317 {
318 struct rx_exp_node * n;
319 if (type != r_parens)
320 panic ("unreasonable rexp node type in rx_mk_r_subexp_int");
321 n = rx_exp_node (type);
322 if (!n)
323 return 0;
324 n->left = subexp;
325 n->intval = intval;
326 return n;
327 }
328
329
330 /*(c rx_mk_r_int2)
331 * struct rx_exp_node * rx_mk_r_int2 (enum rx_exp_node_type type,
332 * int intval,
333 * int intval2);
334 *
335 * Allocate a regexp expression node whose parameters are two
336 * integers. The reference count is initialized to 1, the type to
337 * `type', the `intval' and `intval2' fields are initialized to `intval' and `intval2', and all
338 * other fields to 0.
339 *
340 * If any allocation fails, return 0.
341 *
342 * `type' should be `r_interval'.
343 */
344 struct rx_exp_node *
rx_mk_r_int2(enum rx_exp_node_type type,int intval,int intval2)345 rx_mk_r_int2 (enum rx_exp_node_type type,
346 int intval,
347 int intval2)
348 {
349 struct rx_exp_node * n;
350 if (type != r_interval)
351 panic ("unreasonable rexp node type in rx_mk_r_int2");
352 n = rx_exp_node (type);
353 if (!n)
354 return 0;
355 n->intval = intval;
356 n->intval2 = intval2;
357 return n;
358 }
359
360
361 /*(c rx_mk_r_subexp_int2)
362 * struct rx_exp_node * rx_mk_r_subexp_int2 (enum rx_exp_node_type type,
363 * struct rx_exp_node * subexp,
364 * int intval,
365 * int intval2);
366 *
367 * Allocate a regexp expression node whose parameters are a
368 * subexpression and two integers. The reference count is initialized
369 * to 1, the type to `type', the `left' field is initialized to
370 * `subexp', the `intval' and `intval2' fields are initialized to
371 * `intval' and `intval2', and all other fields to 0.
372 *
373 * The reference count of `subexp' is not modified by this function --
374 * the caller's reference count to that node is taken over by this
375 * function.
376 *
377 * If any allocation fails, return 0.
378 *
379 * `type' should be `r_interval'. The two integer parameters are the
380 * lower and upper bound of the iteration.
381 */
382 struct rx_exp_node *
rx_mk_r_subexp_int2(enum rx_exp_node_type type,struct rx_exp_node * subexp,int intval,int intval2)383 rx_mk_r_subexp_int2 (enum rx_exp_node_type type,
384 struct rx_exp_node * subexp,
385 int intval,
386 int intval2)
387 {
388 struct rx_exp_node * n;
389 if (type != r_interval)
390 panic ("unreasonable rexp node type in rx_mk_r_int2");
391 n = rx_exp_node (type);
392 if (!n)
393 return 0;
394 n->left = subexp;
395 n->intval = intval;
396 n->intval2 = intval2;
397 return n;
398 }
399
400
401 /****************************************************************
402 *(h1 "Regexp Tree Reference Counting and Copying.")
403 *
404 */
405
406 /*(c rx_save_exp)
407 * void rx_save_exp (struct rx_exp_node * node);
408 *
409 * Increment the reference count of `node'.
410 */
411 void
rx_save_exp(struct rx_exp_node * node)412 rx_save_exp (struct rx_exp_node * node)
413 {
414 if (node)
415 ++node->refs;
416 }
417
418
419 /*(c rx_free_exp)
420 * void rx_free_exp (struct rx_exp_node * node);
421 *
422 * Decrement the reference count of `node'. If it drops to 0, free
423 * the node, free all allocated data attached to it, and recursively
424 * `rx_free_exp' the sub-expressions of `node'.
425 */
426 void
rx_free_exp(struct rx_exp_node * node)427 rx_free_exp (struct rx_exp_node * node)
428 {
429 if (node && !--node->refs)
430 {
431 if (node->cr)
432 {
433 node->next_same_nfa->prev_same_nfa = node->prev_same_nfa;
434 node->prev_same_nfa->next_same_nfa = node->next_same_nfa;
435 }
436 if (node->cset)
437 bits_free (node->cset);
438 if (node->str)
439 rx_nfa_cache_free (node->str);
440 rx_free_exp (node->left);
441 rx_free_exp (node->right);
442 rx_free_exp (node->simplified);
443 rx_nfa_cache_free ((char *)node);
444 ++exps_freed;
445 /* safe_printfmt (1, "free %lx\n", (unsigned long)node); */
446 }
447 }
448
449
450 /****************************************************************
451 *(h1 "Regexp Tree Hashing and Equality")
452 *
453 */
454
455 /*(c rx_exp_equal)
456 * int rx_exp_equal (struct rx_exp_node * a, struct rx_exp_node * b);
457 *
458 * Return 1 if two regexp expression trees are "equal", 0 otherwise.
459 *
460 * "Equal" means that the two trees have identical structure which is
461 * a sufficient but not necessary condition of the two patterns
462 * matching the same set of strings with identical results.
463 */
464 int
rx_exp_equal(struct rx_exp_node * a,struct rx_exp_node * b)465 rx_exp_equal (struct rx_exp_node * a, struct rx_exp_node * b)
466 {
467 int ret;
468
469 if (a == b)
470 return 1;
471
472 if ((a == 0) || (b == 0))
473 return 0;
474
475 if (a->type != b->type)
476 return 0;
477
478 switch (a->type)
479 {
480 case r_cset:
481 ret = ( (a->cset_size == b->cset_size)
482 && bits_is_equal (a->cset, b->cset));
483 break;
484
485 case r_string:
486 ret = ( (a->str_len == b->str_len)
487 && !str_cmp_n (a->str, a->str_len, b->str, a->str_len));
488 break;
489
490 case r_cut:
491 ret = (a->intval == b->intval);
492 break;
493
494 case r_concat:
495 case r_right_concat:
496 case r_alternate:
497 ret = ( rx_exp_equal (a->left, b->left)
498 && rx_exp_equal (a->right, b->right));
499 break;
500 case r_star:
501 ret = rx_exp_equal (a->left, b->left);
502 break;
503 case r_interval:
504 ret = ( (a->intval == b->intval)
505 && (a->intval2 == b->intval2)
506 && rx_exp_equal (a->left, b->left));
507 break;
508 case r_parens:
509 ret = ( (a->intval == b->intval)
510 && rx_exp_equal (a->left, b->left));
511 break;
512
513 case r_context:
514 ret = (a->intval == b->intval);
515 break;
516 default:
517 return 0;
518 }
519 return ret;
520 }
521
522
523 static unsigned long
exp_hash(struct rx_exp_node * node,unsigned long seed)524 exp_hash (struct rx_exp_node * node, unsigned long seed)
525 {
526 unsigned long contribution;
527
528 if (!node)
529 return seed;
530
531 /* This is just made up and should be checked out. */
532
533 contribution = ( node->type
534 ^ str_hash_n (node->str, node->str_len)
535 ^ ((seed << 3) + node->intval)
536 ^ ((seed << 3) + node->intval2));
537
538 seed = contribution ^ (seed << 11) ^ (seed >> (8 * sizeof (seed) - 11));
539 seed = exp_hash (node->left, seed);
540 seed = exp_hash (node->right, seed);
541 return seed;
542 }
543
544
545 /*(c rx_exp_hash)
546 * unsigned long rx_exp_hash (struct rx_exp_node * node);
547 *
548 * Compute a word-sized hash value for an expression tree. Regexps
549 * which are equal (in the sense of `rx_exp_equal') have equal hash
550 * values (in the sense of `==').
551 */
552 unsigned long
rx_exp_hash(struct rx_exp_node * node)553 rx_exp_hash (struct rx_exp_node * node)
554 {
555 return exp_hash (node, 0);
556 }
557
558
559