1 /* match-regexp.h - low-level functions for comparing a string to a regexp
2  *
3  ****************************************************************
4  * Copyright (C) 1998, 2000 Thomas Lord
5  *
6  * See the file "COPYING" for further information about
7  * the copyright and warranty status of this work.
8  */
9 
10 
11 #ifndef INCLUDE__RX_POSIX__MATCH_REGEXP_H
12 #define INCLUDE__RX_POSIX__MATCH_REGEXP_H
13 
14 
15 
16 #include "hackerlab/machine/types.h"
17 #include "hackerlab/rx/tree.h"
18 
19 
20 
21 /* rx_off_t	An internal type used by the Posix interface as `regoff_t'.
22  *
23  * 		(`regoff_t' is required by Posix.2) Used to represent
24  * 		offsets to substrings within a string matched by
25  * 		`regexec'.  `regoff_t' is a signed arithmetic type
26  * 		that can hold the largest value that can be stored in
27  * 		either `off_t' or `long'.
28  * 		NOTE(review): this is a plain `long'; if `off_t' is wider than
29  * 		`long' on some target, not every `off_t' value fits -- confirm.  */
30 typedef long rx_off_t;
31 
32 struct rx_registers		/* Start and end offsets of one matched (sub)expression.  */
33 {
34   rx_off_t rm_so; 		/* Byte offset from string's start to substring's start (-1 if not matched).  */
35   rx_off_t rm_eo;  		/* Byte offset from string's start to substring's end (-1 if not matched).  */
36   int final_tag;		/* In register 0 of an array of registers, this field
37 				 * is set to the state label of the last superstate encountered
38 				 * during a match.  (Only register 0 is described as carrying it.)
39 				 */
40 };
41 
42 
43 /* struct rx_context_rules
44  *
45  * An argument to `rx_basic_make_solutions' used to specify
46  * the behavior of `^', `$', and backreferences.
47  */
48 struct rx_context_rules
49 {
50   t_uchar newline_anchor;	/* If true, an anchor (`^' or `$') at a newline matches.  */
51   t_uchar not_bol;	/* If set, `^' doesn't match at the start of the string.  */
52   t_uchar not_eol;	/* If set, `$' doesn't match at the end of the string.  */
53   t_uchar case_indep;	/* NOTE(review): presumably makes backreference comparison case-independent -- confirm.  */
54 };
55 
56 /* struct rx_solutions;
57  *
58  * A lazily computed stream of solutions for an expression or
59  * subexpression compared to a string.  Only declared here; the
60  * definition is not part of this header.  */
61 struct rx_solutions;
62 
63 
64 /************************************************************************
65  *(paragraphs)
66  */
67 
68 /*(c rx_vmfn :category type)
69  * typedef int (*rx_vmfn) (void * closure,
70  *			   const t_uchar ** burst,
71  *			   rx_off_t * len,
72  *			   rx_off_t * offset,
73  *			   rx_off_t start, rx_off_t end, rx_off_t need);
74  *
75  * An `rx_vmfn' is passed to `rx_make_solutions' and used by
76  * `rx_next_solution' to access the input string being compared to a
77  * regexp.  The purpose of this function is to permit the calling
78  * program to only keep part of the input string in memory, and to
79  * keep the input string in non-contiguous regions in memory.
80  *
81  * When called, `rx_vmfn' is passed:
82  *
83  * `closure' -- the opaque parameter passed to `rx_make_solutions'.
84  *
85  * `burst' -- an output parameter that will point to part of the input
86  * string.  The pointer returned in this parameter must remain valid
87  * until the next call to `rx_vmfn' or `rx_contextfn' for the same
88  * call to `rx_next_solution'.
89  *
90  * `len' -- an output parameter; the length of the string returned in
91  * `*burst'.
92  *
93  * `offset' -- an output parameter; the position of `*burst' within
94  * the input string (e.g., 0 for the beginning of the input string,
95  * 9 if `*burst' begins at the tenth character of the input).
96  *
97  * `start' through `end' are the input positions requested by Rx.
98  * `need' is the input position that must be returned.  `rx_vmfn' is
99  * permitted to return any substring of the input that contains
100  * `need', but the performance of Rx itself is best if it returns a
101  * substring containing at least the entire range from `start' to
102  * `end'.  The precise performance implications of a particular
103  * implementation of `rx_vmfn' are application specific.
104  *
105  * Note that Rx may access parts of the string out of order and may
106  * visit the same part of the string more than once.
107  *
108  * This function should return 0 on success, and some other value on
109  * error.
110  */
111 typedef int (*rx_vmfn) (void * closure,
112 			const t_uchar ** burst,
113 			rx_off_t * len,
114 			rx_off_t * offset,
115 			rx_off_t start, rx_off_t end, rx_off_t need);
116 
117 
118 
119 /*(c rx_contextfn :category type)
120  * typedef int (*rx_contextfn) (void * closure,
121  * 			        struct rx_exp_node * node,
122  * 			        rx_off_t start, rx_off_t end,
123  * 			        struct rx_registers * regs);
124  *
125  * An `rx_contextfn' is passed to `rx_make_solutions' and used by
126  * `rx_next_solution' to test parts of the input string being compared
127  * to a regexp.  Like `rx_vmfn', it permits the calling program to
128  * only keep part of the input string in memory, and to keep the
129  * input string in non-contiguous regions in memory.
130  *
131  * `rx_contextfn' is responsible for evaluating subexpressions
132  * which are anchors (`^' and `$') and subexpressions which are
133  * backreferences (e.g. `\1').
134  *
135  * When called, `rx_contextfn' is passed:
136  *
137  * `closure' -- the opaque parameter passed to `rx_make_solutions'.
138  *
139  * `node' -- The regexp syntax tree node of the expression to match.
140  *
141  * `start' and `end' -- the positions within the input string (from
142  * `start' to `end-1') that must match `node'.
143  *
144  * `regs' -- subexpression position information for preceding
145  * subexpressions.  This is used for backreferences.  Note that if a
146  * previous subexpression was not matched, its starting and ending
147  * positions will be recorded as -1.
148  *
149  * This function should return 1 if the subexpression matches, 0
150  * otherwise.
151  */
152 typedef int (*rx_contextfn) (void * closure,
153 			     struct rx_exp_node * node,
154 			     rx_off_t start, rx_off_t end,
155 			     struct rx_registers * regs);
156 
157 
158 /* automatically generated __STDC__ prototypes (callback contracts: see `rx_vmfn' and `rx_contextfn' above) */
159 extern struct rx_solutions * rx_basic_make_solutions (struct rx_registers * regs,
160 						      struct rx_exp_node * expression,
161 						      struct rx_exp_node ** subexps,
162 						      int nsub,
163 						      rx_off_t start,
164 						      rx_off_t end,
165 						      struct rx_context_rules * rules,	/* behavior of `^', `$' and backreferences */
166 						      const t_uchar * str,
167 						      int small_p);
168 extern void rx_basic_free_solutions (struct rx_solutions * solns);
169 extern int rx_next_solution (struct rx_solutions * solns);	/* uses the `rx_vmfn' / `rx_contextfn' given to `rx_make_solutions' */
170 extern int rx_solutions_final_tag (struct rx_solutions * solns);
171 extern struct rx_solutions * rx_make_solutions (struct rx_registers * regs,
172 						int cset_size,
173 						struct rx_exp_node * expression,
174 						struct rx_exp_node ** subexps,
175 						int nsub,
176 						rx_off_t start,
177 						rx_off_t end,
178 						int interval_x,
179 						rx_vmfn vmfn,		/* supplies parts of the input string */
180 						rx_contextfn contextfn,	/* evaluates anchors and backreferences */
181 						void * closure,		/* opaque; handed to `vmfn' and `contextfn' */
182 						int small_p,
183 						int certainly_fits,
184 						int certain_final_tag);
185 extern void * rx_solutions_closure (struct rx_solutions * solns);	/* NOTE(review): presumably the `closure' given to `rx_make_solutions' -- confirm */
186 extern void rx_free_solutions (struct rx_solutions * solns);
187 extern int rx_simplify_rexp (struct rx_exp_node ** answer,
188 			     int cset_size,
189 			     struct rx_exp_node * node,
190 			     struct rx_exp_node ** subexps);
191 extern int rx_analyze_rexp (struct rx_exp_node *** subexps,
192 			    size_t * re_nsub,	/* NOTE(review): `size_t' presumably comes via hackerlab/machine/types.h -- confirm */
193 			    struct rx_exp_node * node);
194 #endif  /* INCLUDE__RX_POSIX__MATCH_REGEXP_H */
195