1 /* tree.c - parse trees for regexps
2  *
3  ****************************************************************
4  * Copyright (C) 1998, 2000 Thomas Lord
5  *
6  * See the file "COPYING" for further information about
7  * the copyright and warranty status of this work.
8  */
9 
10 
11 
12 #include "hackerlab/bugs/panic.h"
13 #include "hackerlab/char/str.h"
14 #include "hackerlab/mem/mem.h"
15 #include "hackerlab/rx/nfa-cache.h"
16 #include "hackerlab/rx/tree.h"
17 
18 
19 static unsigned long exps_allocated = 0;
20 static unsigned long exps_freed = 0;
21 
22 /****************************************************************
23  *(h0 "Regexp Expression Trees"
24  *    :subtitle "rx/rexp.c"
25  *    :includes ("rx/tree.h"))
26  *
27  *
28  * Every regexp has a natural representation as a syntax tree.  The
29  * type `struct rx_exp_node' is used to represent one node of such a
30  * tree.  Functions are provided that construct and manipulate
31  * expression trees representing regexps.
32  *
33  * Syntax trees can be constructed directly by programs as a way to
34  * specify a regexp and strings obeying standard regexp syntax can be
35  * parsed to produce syntax trees.  Regexp parsing is explained in
36  * xref:"Regexp Parsing".  The topic of this chapter is the output
37  * of parsing: the data structure used to represent a regexp syntax
38  * tree.
39  *
40  * Rx is able to compute some analytic (recursively defined)
41  * properties of regexp expression tree nodes.  Therefore, the Rx
42  * representation of expression tree nodes contains some "extra"
43  * fields to hold that data.
44  *
45  * Rx is able to translate an expression tree into an NFA and
46  * indirectly into a DFA.  The performance of this translation is
47  * commonly critical and is greatly improved by caching information
48  * in still more "extra" fields of expression nodes.
49  */
50 /*(menu)
51  */
52 
53 /*(include-documentation "tree.h")
54  */
55 
56 /****************************************************************
57  *(h1 "Regexp Tree Allocation and Initialization")
58  */
59 
60 /*(c rx_exp_node)
61  * struct rx_exp_node * rx_exp_node (int type);
62  *
63  * Allocate a new expression node and initialize its type
64  * and reference count.  The reference count is initialized to 1,
65  * the type to `type', and all other fields to 0.
66  *
67  * If the allocation fails, return 0.
68  *
69  */
70 struct rx_exp_node *
rx_exp_node(enum rx_exp_node_type type)71 rx_exp_node (enum rx_exp_node_type type)
72 {
73   struct rx_exp_node *n;
74 
75   n = (struct rx_exp_node *) rx_nfa_cache_malloc (sizeof (*n));
76   if (!n)
77     return 0;
78   ++exps_allocated;
79   mem_set0 ((char *)n, sizeof (*n));
80   n->type = type;
81   n->refs = 1;
82   return n;
83 }
84 
85 
86 /*(c rx_mk_r_cset)
87  * struct rx_exp_node * rx_mk_r_cset (enum rx_exp_node_type type,
88  *				      int size, bits b);
89  *
90  * Allocate a new character set expression node.  The reference count
91  * is initialized to 1, the type to `type', the `cset' field to a
92  * newly allocated copy of `b', the `cset_size' field to `size', and
93  * all other fields to 0.
94  *
95  * If any allocation fails, return 0;
96  *
97  * `type' should be `r_cset'.
98  */
99 struct rx_exp_node *
rx_mk_r_cset(enum rx_exp_node_type type,int size,bits b)100 rx_mk_r_cset (enum rx_exp_node_type type, int size, bits b)
101 {
102   struct rx_exp_node * n;
103 
104   if (type != r_cset)
105     panic ("unreasonable rexp node type in rx_mk_r_cset");
106 
107   n = rx_exp_node (type);
108   if (!n)
109     return 0;
110   n->cset = bits_dup (b);
111   if (!n->cset)
112     {
113       rx_free_exp (n);
114       return 0;
115     }
116   n->cset_size = size;
117   return n;
118 }
119 
120 
121 /*(c rx_mk_r_cset_take)
122  * struct rx_exp_node * rx_mk_r_cset_take (enum rx_exp_node_type type,
123  *				      int size, bits b);
124  *
125  * Allocate a new character set expression node.  The reference count
126  * is initialized to 1, the type to `type', the `cset' field to a
127  * newly allocated copy of `b', the `cset_size' field to `size', and
128  * all other fields to 0.
129  *
130  * If any allocation fails, return 0.
131  *
132  * `type' should be `r_cset'.
133  */
134 struct rx_exp_node *
rx_mk_r_cset_take(enum rx_exp_node_type type,int size,bits b)135 rx_mk_r_cset_take (enum rx_exp_node_type type, int size, bits b)
136 {
137   struct rx_exp_node * n;
138 
139   if (type != r_cset)
140     panic ("unreasonable rexp node type in rx_mk_r_cset_take");
141   n = rx_exp_node (type);
142   if (!n)
143     return 0;
144   n->cset = b;
145   n->cset_size = size;
146   return n;
147 }
148 
149 
150 /*(c rx_mk_r_binop)
151  *  struct rx_exp_node * rx_mk_r_binop (enum rx_exp_node_type type,
152  *                                      struct rx_exp_node * a,
153  *                                      struct rx_exp_node * b);
154  *
155  * Allocate a regexp expression node with two subexpressions.  The
156  * reference count is initialized to 1, the type to `type', the `left'
157  * and `right' fields to `a' and `b', and all other fields to 0.
158  *
159  * If any allocation fails, return 0.
160  *
161  * The reference counts of `a' and `b' are not modified by this
162  * function -- the caller's reference count to those nodes is taken
163  * over by this function.
164  *
165  * `type' should be `r_concat' `r_right_concat' or `r_alternate'.
166  */
167 struct rx_exp_node *
rx_mk_r_binop(enum rx_exp_node_type type,struct rx_exp_node * a,struct rx_exp_node * b)168 rx_mk_r_binop (enum rx_exp_node_type type,
169 	       struct rx_exp_node * a,
170 	       struct rx_exp_node * b)
171 {
172   struct rx_exp_node * n;
173   if ((type != r_concat) && (type != r_right_concat) && (type != r_alternate))
174     panic ("unreasonable rexp node type in rx_mk_r_binop");
175   n = rx_exp_node (type);
176   if (!n)
177     return 0;
178   n->left = a;
179   n->right = b;
180   return n;
181 }
182 
183 
184 /*(c rx_mk_r_monop)
185  * struct rx_exp_node * rx_mk_r_monop (enum rx_exp_node_type type,
186  *                                     struct rx_exp_node * a);
187  *
188  * Allocate a regexp expression node with one subexpression.  The
189  * reference count is initialized to 1, the type to `type', the `left'
190  * field to `a', and all other fields to 0.
191  *
192  * If any allocation fails, return 0.
193  *
194  * The reference count of `a' is not modified by this function -- the
195  * caller's reference count to that node is taken over by this
196  * function.
197  *
198  * `type' should be `r_star', `r_interval', `r_parens', or `r_context'.
199  */
200 struct rx_exp_node *
rx_mk_r_monop(enum rx_exp_node_type type,struct rx_exp_node * a)201 rx_mk_r_monop (enum rx_exp_node_type type,
202 	       struct rx_exp_node * a)
203 {
204   struct rx_exp_node * n;
205   if (   (type != r_star)
206       && (type != r_interval)
207       && (type != r_parens)
208       && (type != r_context))
209     panic ("unreasonable rexp node type in rx_mk_r_monop");
210   n = rx_exp_node (type);
211   if (!n)
212     return 0;
213   n->left = a;
214   n->right = 0;
215   return n;
216 }
217 
218 
219 
220 /*(c rx_mk_r_str)
221  * struct rx_exp_node * rx_mk_r_str (enum rx_exp_node_type type,
222  *                                   const t_uchar * s,
223  *                                   size_t len,
224  *				     enum uni_encoding_scheme encoding);
225  *
226  * Allocate a regexp expression node whose parameter is a string.  The
227  * reference count is initialized to 1, the type to `type', the `str'
228  * field to a newly allocated copy of the `len'-byte string `s', the
229  * `str_len' field is initialized to `len', and all other fields to 0.
230  *
231  * If any allocation fails, return 0.
232  *
233  * `type' should be `r_string'.
234  */
235 struct rx_exp_node *
rx_mk_r_str(enum rx_exp_node_type type,const t_uchar * s,size_t len,enum uni_encoding_scheme encoding)236 rx_mk_r_str (enum rx_exp_node_type type,
237 	     const t_uchar * s,
238 	     size_t len,
239 	     enum uni_encoding_scheme encoding)
240 {
241   struct rx_exp_node *n;
242 
243   if (type != r_string)
244     panic ("unreasonable rexp node type in rx_mk_r_str_c");
245   n = rx_exp_node (type);
246   if (!n)
247     return 0;
248   n->str = (t_uchar *)rx_nfa_cache_malloc (len);
249   if (len && !n->str)
250     {
251       rx_free_exp (n);
252       return 0;
253     }
254   mem_move (n->str, s, len);
255   n->str_len = len;
256   n->encoding = encoding;
257   return n;
258 }
259 
260 
261 /*(c rx_mk_r_int)
262  * struct rx_exp_node * rx_mk_r_int (enum rx_exp_node_type type,
263  *                                   int intval);
264  *
265  * Allocate a regexp expression node whose parameter is an integer.
266  * The reference count is initialized to 1, the type to `type', the
267  * `intval' field is initialized to `intval', and all other fields to
268  * 0.
269  *
270  * If any allocation fails, return 0.
271  *
272  * `type' should be `r_cut', `r_parens', or `r_context'.
273  */
274 struct rx_exp_node *
rx_mk_r_int(enum rx_exp_node_type type,int intval)275 rx_mk_r_int (enum rx_exp_node_type type,
276 	     int intval)
277 {
278   struct rx_exp_node * n;
279   if (   (type != r_cut)
280       && (type != r_parens)
281       && (type != r_context))
282     panic ("unreasonable rexp node type in rx_mk_r_int");
283   n = rx_exp_node (type);
284   if (!n)
285     return 0;
286   n->intval = intval;
287   return n;
288 }
289 
290 
291 /*(c rx_mk_r_subexp_int)
292  * struct rx_exp_node * rx_mk_r_subexp_int (enum rx_exp_node_type type,
293  *				            struct rx_exp_node * subexp,
294  *                                          int intval);
295  *
296  * Allocate a regexp expression node whose parameters are a
297  * subexpression and an integer.  The reference count is initialized
298  * to 1, the type to `type', the `left' field is initialized to
299  * `subexp', the `intval' field is initialized to `intval', and all
300  * other fields to 0.
301  *
302  * The reference count of `subexp' is not modified by this function --
303  * the caller's reference count to that node is taken over by this
304  * function.
305  *
306  * If any allocation fails, return 0.
307  *
308  * `type' should be `r_parens'.  `intval' is the subexpression number
309  * for this parenthesized expression (the number used for
310  * backreferences).  If `intval' is 0, this is an anonymous
311  * subexpression (it can not be backreferenced).
312  */
313 struct rx_exp_node *
rx_mk_r_subexp_int(enum rx_exp_node_type type,struct rx_exp_node * subexp,int intval)314 rx_mk_r_subexp_int (enum rx_exp_node_type type,
315 		    struct rx_exp_node * subexp,
316 		    int intval)
317 {
318   struct rx_exp_node * n;
319   if (type != r_parens)
320     panic ("unreasonable rexp node type in rx_mk_r_subexp_int");
321   n = rx_exp_node (type);
322   if (!n)
323     return 0;
324   n->left = subexp;
325   n->intval = intval;
326   return n;
327 }
328 
329 
330 /*(c rx_mk_r_int2)
331  * struct rx_exp_node * rx_mk_r_int2 (enum rx_exp_node_type type,
332  *                                    int intval,
333  *                                    int intval2);
334  *
335  * Allocate a regexp expression node whose parameters are two
336  * integers.  The reference count is initialized to 1, the type to
337  * `type', the `intval' and `intval2' fields are initialized to `intval' and `intval2', and all
338  * other fields to 0.
339  *
340  * If any allocation fails, return 0.
341  *
342  * `type' should be `r_interval'.
343  */
344 struct rx_exp_node *
rx_mk_r_int2(enum rx_exp_node_type type,int intval,int intval2)345 rx_mk_r_int2 (enum rx_exp_node_type type,
346 	      int intval,
347 	      int intval2)
348 {
349   struct rx_exp_node * n;
350   if (type != r_interval)
351     panic ("unreasonable rexp node type in rx_mk_r_int2");
352   n = rx_exp_node (type);
353   if (!n)
354     return 0;
355   n->intval = intval;
356   n->intval2 = intval2;
357   return n;
358 }
359 
360 
361 /*(c rx_mk_r_subexp_int2)
362  * struct rx_exp_node * rx_mk_r_subexp_int2 (enum rx_exp_node_type type,
363  * 					     struct rx_exp_node * subexp,
364  *                                           int intval,
365  *                                           int intval2);
366  *
367  * Allocate a regexp expression node whose parameters are a
368  * subexpression and two integers.  The reference count is initialized
369  * to 1, the type to `type', the `left' field is initialized to
370  * `subexp', the `intval' and `intval2' fields are initialized to
371  * `intval' and `intval2', and all other fields to 0.
372  *
373  * The reference count of `subexp' is not modified by this function --
374  * the caller's reference count to that node is taken over by this
375  * function.
376  *
377  * If any allocation fails, return 0.
378  *
379  * `type' should be `r_interval'.  The two integer parameters are the
380  * lower and upper bound of the iteration.
381  */
382 struct rx_exp_node *
rx_mk_r_subexp_int2(enum rx_exp_node_type type,struct rx_exp_node * subexp,int intval,int intval2)383 rx_mk_r_subexp_int2 (enum rx_exp_node_type type,
384 		     struct rx_exp_node * subexp,
385 		     int intval,
386 		     int intval2)
387 {
388   struct rx_exp_node * n;
389   if (type != r_interval)
390     panic ("unreasonable rexp node type in rx_mk_r_int2");
391   n = rx_exp_node (type);
392   if (!n)
393     return 0;
394   n->left = subexp;
395   n->intval = intval;
396   n->intval2 = intval2;
397   return n;
398 }
399 
400 
401 /****************************************************************
402  *(h1 "Regexp Tree Reference Counting and Copying.")
403  *
404  */
405 
406 /*(c rx_save_exp)
407  * void rx_save_exp (struct rx_exp_node * node);
408  *
409  * Increment the reference count of `node'.
410  */
411 void
rx_save_exp(struct rx_exp_node * node)412 rx_save_exp (struct rx_exp_node * node)
413 {
414   if (node)
415     ++node->refs;
416 }
417 
418 
419 /*(c rx_free_exp)
420  * void rx_free_exp (struct rx_exp_node * node);
421  *
422  * Decrement the reference count of `node'.  If it drops to 0, free
423  * the node, free all allocated data attached to it, and recursively
424  * `rx_free_exp' the sub-expressions of `node'.
425  */
426 void
rx_free_exp(struct rx_exp_node * node)427 rx_free_exp (struct rx_exp_node * node)
428 {
429   if (node && !--node->refs)
430     {
431       if (node->cr)
432 	{
433 	  node->next_same_nfa->prev_same_nfa = node->prev_same_nfa;
434 	  node->prev_same_nfa->next_same_nfa = node->next_same_nfa;
435 	}
436       if (node->cset)
437 	bits_free (node->cset);
438       if (node->str)
439 	rx_nfa_cache_free (node->str);
440       rx_free_exp (node->left);
441       rx_free_exp (node->right);
442       rx_free_exp (node->simplified);
443       rx_nfa_cache_free ((char *)node);
444       ++exps_freed;
445       /* safe_printfmt (1, "free %lx\n", (unsigned long)node); */
446     }
447 }
448 
449 
450 /****************************************************************
451  *(h1 "Regexp Tree Hashing and Equality")
452  *
453  */
454 
455 /*(c rx_exp_equal)
456  * int rx_exp_equal (struct rx_exp_node * a, struct rx_exp_node * b);
457  *
458  * Return 1 if two regexp expression trees are "equal", 0 otherwise.
459  *
460  * "Equal" means that the two trees have identical structure which is
461  * a sufficient but not necessary condition of the two patterns
462  * matching the same set of strings with identical results.
463  */
464 int
rx_exp_equal(struct rx_exp_node * a,struct rx_exp_node * b)465 rx_exp_equal (struct rx_exp_node * a, struct rx_exp_node * b)
466 {
467   int ret;
468 
469   if (a == b)
470     return 1;
471 
472   if ((a == 0) || (b == 0))
473     return 0;
474 
475   if (a->type != b->type)
476     return 0;
477 
478   switch (a->type)
479     {
480     case r_cset:
481       ret = (   (a->cset_size == b->cset_size)
482 	     && bits_is_equal (a->cset, b->cset));
483       break;
484 
485     case r_string:
486       ret = (   (a->str_len == b->str_len)
487 	     && !str_cmp_n (a->str, a->str_len, b->str, a->str_len));
488       break;
489 
490     case r_cut:
491       ret = (a->intval == b->intval);
492       break;
493 
494     case r_concat:
495     case r_right_concat:
496     case r_alternate:
497       ret = (   rx_exp_equal (a->left, b->left)
498 	     && rx_exp_equal (a->right, b->right));
499       break;
500     case r_star:
501       ret = rx_exp_equal (a->left, b->left);
502       break;
503     case r_interval:
504       ret = (   (a->intval == b->intval)
505 	     && (a->intval2 == b->intval2)
506 	     && rx_exp_equal (a->left, b->left));
507       break;
508     case r_parens:
509       ret = (   (a->intval == b->intval)
510 	     && rx_exp_equal (a->left, b->left));
511       break;
512 
513     case r_context:
514       ret = (a->intval == b->intval);
515       break;
516     default:
517       return 0;
518     }
519   return ret;
520 }
521 
522 
523 static unsigned long
exp_hash(struct rx_exp_node * node,unsigned long seed)524 exp_hash (struct rx_exp_node * node, unsigned long seed)
525 {
526   unsigned long contribution;
527 
528   if (!node)
529     return seed;
530 
531   /* This is just made up and should be checked out. */
532 
533   contribution = (  node->type
534 		  ^ str_hash_n (node->str, node->str_len)
535 		  ^ ((seed << 3) + node->intval)
536 		  ^ ((seed << 3) + node->intval2));
537 
538   seed = contribution ^ (seed << 11) ^ (seed >> (8 * sizeof (seed) - 11));
539   seed = exp_hash (node->left, seed);
540   seed = exp_hash (node->right, seed);
541   return seed;
542 }
543 
544 
545 /*(c rx_exp_hash)
546  * unsigned long rx_exp_hash (struct rx_exp_node * node);
547  *
548  * Compute a word-sized hash value for an expression tree.  Regexps
549  * which are equal (in the sense of `rx_exp_equal') have equal hash
550  * values (in the sense of `==').
551  */
552 unsigned long
rx_exp_hash(struct rx_exp_node * node)553 rx_exp_hash (struct rx_exp_node * node)
554 {
555   return exp_hash (node, 0);
556 }
557 
558 
559