1 /*
2  *
3  * Copyright (c) 1998-2002
4  * John Maddock
5  *
6  * Use, modification and distribution are subject to the
7  * Boost Software License, Version 1.0. (See accompanying file
8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9  *
10  */
11 
12  /*
13   *   LOCATION:    see http://www.boost.org for most recent version.
14   *   FILE         states.cpp
15   *   VERSION      see <boost/version.hpp>
16   *   DESCRIPTION: Declares internal state machine structures.
17   */
18 
19 #ifndef BOOST_REGEX_V4_STATES_HPP
20 #define BOOST_REGEX_V4_STATES_HPP
21 
22 #ifdef BOOST_MSVC
23 #pragma warning(push)
24 #pragma warning(disable: 4103)
25 #endif
26 #ifdef BOOST_HAS_ABI_HEADERS
27 #  include BOOST_ABI_PREFIX
28 #endif
29 #ifdef BOOST_MSVC
30 #pragma warning(pop)
31 #endif
32 
33 namespace boost{
34 namespace BOOST_REGEX_DETAIL_NS{
35 
36 /*** mask_type *******************************************************
37 Whenever we have a choice of two alternatives, we use an array of bytes
38 to indicate which of the two alternatives it is possible to take for any
39 given input character.  If mask_take is set, then we can take the next
40 state, and if mask_skip is set then we can take the alternative.
41 ***********************************************************************/
42 enum mask_type
43 {
44    mask_take = 1,
45    mask_skip = 2,
46    mask_init = 4,
47    mask_any = mask_skip | mask_take,
48    mask_all = mask_any
49 };
50 
51 /*** helpers **********************************************************
52 These helpers let us use function overload resolution to detect whether
53 we have narrow or wide character strings:
54 ***********************************************************************/
55 struct _narrow_type{};
56 struct _wide_type{};
57 template <class charT> struct is_byte;
58 template<>             struct is_byte<char>         { typedef _narrow_type width_type; };
59 template<>             struct is_byte<unsigned char>{ typedef _narrow_type width_type; };
60 template<>             struct is_byte<signed char>  { typedef _narrow_type width_type; };
61 template <class charT> struct is_byte               { typedef _wide_type width_type; };
62 
63 /*** enum syntax_element_type ******************************************
64 Every record in the state machine falls into one of the following types:
65 ***********************************************************************/
66 enum syntax_element_type
67 {
68    // start of a marked sub-expression, or perl-style (?...) extension
69    syntax_element_startmark = 0,
70    // end of a marked sub-expression, or perl-style (?...) extension
71    syntax_element_endmark = syntax_element_startmark + 1,
72    // any sequence of literal characters
73    syntax_element_literal = syntax_element_endmark + 1,
74    // start of line assertion: ^
75    syntax_element_start_line = syntax_element_literal + 1,
76    // end of line assertion $
77    syntax_element_end_line = syntax_element_start_line + 1,
78    // match any character: .
79    syntax_element_wild = syntax_element_end_line + 1,
80    // end of expression: we have a match when we get here
81    syntax_element_match = syntax_element_wild + 1,
82    // perl style word boundary: \b
83    syntax_element_word_boundary = syntax_element_match + 1,
84    // perl style within word boundary: \B
85    syntax_element_within_word = syntax_element_word_boundary + 1,
86    // start of word assertion: \<
87    syntax_element_word_start = syntax_element_within_word + 1,
88    // end of word assertion: \>
89    syntax_element_word_end = syntax_element_word_start + 1,
90    // start of buffer assertion: \`
91    syntax_element_buffer_start = syntax_element_word_end + 1,
92    // end of buffer assertion: \'
93    syntax_element_buffer_end = syntax_element_buffer_start + 1,
94    // backreference to previously matched sub-expression
95    syntax_element_backref = syntax_element_buffer_end + 1,
96    // either a wide character set [..] or one with multicharacter collating elements:
97    syntax_element_long_set = syntax_element_backref + 1,
98    // narrow character set: [...]
99    syntax_element_set = syntax_element_long_set + 1,
100    // jump to a new state in the machine:
101    syntax_element_jump = syntax_element_set + 1,
102    // choose between two production states:
103    syntax_element_alt = syntax_element_jump + 1,
104    // a repeat
105    syntax_element_rep = syntax_element_alt + 1,
106    // match a combining character sequence
107    syntax_element_combining = syntax_element_rep + 1,
108    // perl style soft buffer end: \z
109    syntax_element_soft_buffer_end = syntax_element_combining + 1,
110    // perl style continuation: \G
111    syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,
112    // single character repeats:
113    syntax_element_dot_rep = syntax_element_restart_continue + 1,
114    syntax_element_char_rep = syntax_element_dot_rep + 1,
115    syntax_element_short_set_rep = syntax_element_char_rep + 1,
116    syntax_element_long_set_rep = syntax_element_short_set_rep + 1,
117    // a backstep for lookbehind repeats:
118    syntax_element_backstep = syntax_element_long_set_rep + 1,
119    // an assertion that a mark was matched:
120    syntax_element_assert_backref = syntax_element_backstep + 1,
121    syntax_element_toggle_case = syntax_element_assert_backref + 1,
122    // a recursive expression:
123    syntax_element_recurse = syntax_element_toggle_case + 1,
124    // Verbs:
125    syntax_element_fail = syntax_element_recurse + 1,
126    syntax_element_accept = syntax_element_fail + 1,
127    syntax_element_commit = syntax_element_accept + 1,
128    syntax_element_then = syntax_element_commit + 1
129 };
130 
131 #ifdef BOOST_REGEX_DEBUG
132 // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
133 std::ostream& operator<<(std::ostream&, syntax_element_type);
134 #endif
135 
136 struct re_syntax_base;
137 
138 /*** union offset_type ************************************************
139 Points to another state in the machine.  During machine construction
140 we use integral offsets, but these are converted to pointers before
141 execution of the machine.
142 ***********************************************************************/
143 union offset_type
144 {
145    re_syntax_base*   p;
146    std::ptrdiff_t    i;
147 };
148 
149 /*** struct re_syntax_base ********************************************
150 Base class for all states in the machine.
151 ***********************************************************************/
152 struct re_syntax_base
153 {
154    syntax_element_type   type;         // what kind of state this is
155    offset_type           next;         // next state in the machine
156 };
157 
158 /*** struct re_brace **************************************************
159 A marked parenthesis.
160 ***********************************************************************/
161 struct re_brace : public re_syntax_base
162 {
163    // The index to match, can be zero (don't mark the sub-expression)
164    // or negative (for perl style (?...) extentions):
165    int index;
166    bool icase;
167 };
168 
169 /*** struct re_dot **************************************************
170 Match anything.
171 ***********************************************************************/
172 enum
173 {
174    dont_care = 1,
175    force_not_newline = 0,
176    force_newline = 2,
177 
178    test_not_newline = 2,
179    test_newline = 3
180 };
181 struct re_dot : public re_syntax_base
182 {
183    unsigned char mask;
184 };
185 
186 /*** struct re_literal ************************************************
187 A string of literals, following this structure will be an
188 array of characters: charT[length]
189 ***********************************************************************/
190 struct re_literal : public re_syntax_base
191 {
192    unsigned int length;
193 };
194 
195 /*** struct re_case ************************************************
196 Indicates whether we are moving to a case insensive block or not
197 ***********************************************************************/
198 struct re_case : public re_syntax_base
199 {
200    bool icase;
201 };
202 
203 /*** struct re_set_long ***********************************************
204 A wide character set of characters, following this structure will be
205 an array of type charT:
206 First csingles null-terminated strings
207 Then 2 * cranges NULL terminated strings
208 Then cequivalents NULL terminated strings
209 ***********************************************************************/
210 template <class mask_type>
211 struct re_set_long : public re_syntax_base
212 {
213    unsigned int            csingles, cranges, cequivalents;
214    mask_type               cclasses;
215    mask_type               cnclasses;
216    bool                    isnot;
217    bool                    singleton;
218 };
219 
220 /*** struct re_set ****************************************************
221 A set of narrow-characters, matches any of _map which is none-zero
222 ***********************************************************************/
223 struct re_set : public re_syntax_base
224 {
225    unsigned char _map[1 << CHAR_BIT];
226 };
227 
228 /*** struct re_jump ***************************************************
229 Jump to a new location in the machine (not next).
230 ***********************************************************************/
231 struct re_jump : public re_syntax_base
232 {
233    offset_type     alt;                 // location to jump to
234 };
235 
236 /*** struct re_alt ***************************************************
237 Jump to a new location in the machine (possibly next).
238 ***********************************************************************/
239 struct re_alt : public re_jump
240 {
241    unsigned char   _map[1 << CHAR_BIT]; // which characters can take the jump
242    unsigned int    can_be_null;         // true if we match a NULL string
243 };
244 
245 /*** struct re_repeat *************************************************
246 Repeat a section of the machine
247 ***********************************************************************/
248 struct re_repeat : public re_alt
249 {
250    std::size_t   min, max;  // min and max allowable repeats
251    int           state_id;        // Unique identifier for this repeat
252    bool          leading;   // True if this repeat is at the start of the machine (lets us optimize some searches)
253    bool          greedy;    // True if this is a greedy repeat
254 };
255 
256 /*** struct re_recurse ************************************************
257 Recurse to a particular subexpression.
258 **********************************************************************/
259 struct re_recurse : public re_jump
260 {
261    int state_id;             // identifier of first nested repeat within the recursion.
262 };
263 
264 /*** struct re_commit *************************************************
265 Used for the PRUNE, SKIP and COMMIT verbs which basically differ only in what happens
266 if no match is found and we start searching forward.
267 **********************************************************************/
268 enum commit_type
269 {
270    commit_prune,
271    commit_skip,
272    commit_commit
273 };
274 struct re_commit : public re_syntax_base
275 {
276    commit_type action;
277 };
278 
279 /*** enum re_jump_size_type *******************************************
280 Provides compiled size of re_jump structure (allowing for trailing alignment).
281 We provide this so we know how manybytes to insert when constructing the machine
282 (The value of padding_mask is defined in regex_raw_buffer.hpp).
283 ***********************************************************************/
284 enum re_jump_size_type
285 {
286    re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
287    re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
288    re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
289 };
290 
291 /*** proc re_is_set_member *********************************************
292 Forward declaration: we'll need this one later...
293 ***********************************************************************/
294 
295 template<class charT, class traits>
296 struct regex_data;
297 
298 template <class iterator, class charT, class traits_type, class char_classT>
299 iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
300                           iterator last,
301                           const re_set_long<char_classT>* set_,
302                           const regex_data<charT, traits_type>& e, bool icase);
303 
304 } // namespace BOOST_REGEX_DETAIL_NS
305 
306 } // namespace boost
307 
308 #ifdef BOOST_MSVC
309 #pragma warning(push)
310 #pragma warning(disable: 4103)
311 #endif
312 #ifdef BOOST_HAS_ABI_HEADERS
313 #  include BOOST_ABI_SUFFIX
314 #endif
315 #ifdef BOOST_MSVC
316 #pragma warning(pop)
317 #endif
318 
319 #endif
320 
321 
322