1 /* 2 * 3 * Copyright (c) 1998-2002 4 * John Maddock 5 * 6 * Use, modification and distribution are subject to the 7 * Boost Software License, Version 1.0. (See accompanying file 8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 9 * 10 */ 11 12 /* 13 * LOCATION: see http://www.boost.org for most recent version. 14 * FILE states.cpp 15 * VERSION see <boost/version.hpp> 16 * DESCRIPTION: Declares internal state machine structures. 17 */ 18 19 #ifndef BOOST_REGEX_V4_STATES_HPP 20 #define BOOST_REGEX_V4_STATES_HPP 21 22 #ifdef BOOST_MSVC 23 #pragma warning(push) 24 #pragma warning(disable: 4103) 25 #endif 26 #ifdef BOOST_HAS_ABI_HEADERS 27 # include BOOST_ABI_PREFIX 28 #endif 29 #ifdef BOOST_MSVC 30 #pragma warning(pop) 31 #endif 32 33 namespace boost{ 34 namespace BOOST_REGEX_DETAIL_NS{ 35 36 /*** mask_type ******************************************************* 37 Whenever we have a choice of two alternatives, we use an array of bytes 38 to indicate which of the two alternatives it is possible to take for any 39 given input character. If mask_take is set, then we can take the next 40 state, and if mask_skip is set then we can take the alternative. 41 ***********************************************************************/ 42 enum mask_type 43 { 44 mask_take = 1, 45 mask_skip = 2, 46 mask_init = 4, 47 mask_any = mask_skip | mask_take, 48 mask_all = mask_any 49 }; 50 51 /*** helpers ********************************************************** 52 These helpers let us use function overload resolution to detect whether 53 we have narrow or wide character strings: 54 ***********************************************************************/ 55 struct _narrow_type{}; 56 struct _wide_type{}; 57 template <class charT> struct is_byte; 58 template<> struct is_byte<char> { typedef _narrow_type width_type; }; 59 template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; }; 60 template<> struct is_byte<signed char> { typedef _narrow_type width_type; }; 61 template <class charT> struct is_byte { typedef _wide_type width_type; }; 62 63 /*** enum syntax_element_type ****************************************** 64 Every record in the state machine falls into one of the following types: 65 ***********************************************************************/ 66 enum syntax_element_type 67 { 68 // start of a marked sub-expression, or perl-style (?...) extension 69 syntax_element_startmark = 0, 70 // end of a marked sub-expression, or perl-style (?...) extension 71 syntax_element_endmark = syntax_element_startmark + 1, 72 // any sequence of literal characters 73 syntax_element_literal = syntax_element_endmark + 1, 74 // start of line assertion: ^ 75 syntax_element_start_line = syntax_element_literal + 1, 76 // end of line assertion $ 77 syntax_element_end_line = syntax_element_start_line + 1, 78 // match any character: . 79 syntax_element_wild = syntax_element_end_line + 1, 80 // end of expression: we have a match when we get here 81 syntax_element_match = syntax_element_wild + 1, 82 // perl style word boundary: \b 83 syntax_element_word_boundary = syntax_element_match + 1, 84 // perl style within word boundary: \B 85 syntax_element_within_word = syntax_element_word_boundary + 1, 86 // start of word assertion: \< 87 syntax_element_word_start = syntax_element_within_word + 1, 88 // end of word assertion: \> 89 syntax_element_word_end = syntax_element_word_start + 1, 90 // start of buffer assertion: \` 91 syntax_element_buffer_start = syntax_element_word_end + 1, 92 // end of buffer assertion: \' 93 syntax_element_buffer_end = syntax_element_buffer_start + 1, 94 // backreference to previously matched sub-expression 95 syntax_element_backref = syntax_element_buffer_end + 1, 96 // either a wide character set [..] or one with multicharacter collating elements: 97 syntax_element_long_set = syntax_element_backref + 1, 98 // narrow character set: [...] 99 syntax_element_set = syntax_element_long_set + 1, 100 // jump to a new state in the machine: 101 syntax_element_jump = syntax_element_set + 1, 102 // choose between two production states: 103 syntax_element_alt = syntax_element_jump + 1, 104 // a repeat 105 syntax_element_rep = syntax_element_alt + 1, 106 // match a combining character sequence 107 syntax_element_combining = syntax_element_rep + 1, 108 // perl style soft buffer end: \z 109 syntax_element_soft_buffer_end = syntax_element_combining + 1, 110 // perl style continuation: \G 111 syntax_element_restart_continue = syntax_element_soft_buffer_end + 1, 112 // single character repeats: 113 syntax_element_dot_rep = syntax_element_restart_continue + 1, 114 syntax_element_char_rep = syntax_element_dot_rep + 1, 115 syntax_element_short_set_rep = syntax_element_char_rep + 1, 116 syntax_element_long_set_rep = syntax_element_short_set_rep + 1, 117 // a backstep for lookbehind repeats: 118 syntax_element_backstep = syntax_element_long_set_rep + 1, 119 // an assertion that a mark was matched: 120 syntax_element_assert_backref = syntax_element_backstep + 1, 121 syntax_element_toggle_case = syntax_element_assert_backref + 1, 122 // a recursive expression: 123 syntax_element_recurse = syntax_element_toggle_case + 1, 124 // Verbs: 125 syntax_element_fail = syntax_element_recurse + 1, 126 syntax_element_accept = syntax_element_fail + 1, 127 syntax_element_commit = syntax_element_accept + 1, 128 syntax_element_then = syntax_element_commit + 1 129 }; 130 131 #ifdef BOOST_REGEX_DEBUG 132 // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion 133 std::ostream& operator<<(std::ostream&, syntax_element_type); 134 #endif 135 136 struct re_syntax_base; 137 138 /*** union offset_type ************************************************ 139 Points to another state in the machine. During machine construction 140 we use integral offsets, but these are converted to pointers before 141 execution of the machine. 142 ***********************************************************************/ 143 union offset_type 144 { 145 re_syntax_base* p; 146 std::ptrdiff_t i; 147 }; 148 149 /*** struct re_syntax_base ******************************************** 150 Base class for all states in the machine. 151 ***********************************************************************/ 152 struct re_syntax_base 153 { 154 syntax_element_type type; // what kind of state this is 155 offset_type next; // next state in the machine 156 }; 157 158 /*** struct re_brace ************************************************** 159 A marked parenthesis. 160 ***********************************************************************/ 161 struct re_brace : public re_syntax_base 162 { 163 // The index to match, can be zero (don't mark the sub-expression) 164 // or negative (for perl style (?...) extentions): 165 int index; 166 bool icase; 167 }; 168 169 /*** struct re_dot ************************************************** 170 Match anything. 171 ***********************************************************************/ 172 enum 173 { 174 dont_care = 1, 175 force_not_newline = 0, 176 force_newline = 2, 177 178 test_not_newline = 2, 179 test_newline = 3 180 }; 181 struct re_dot : public re_syntax_base 182 { 183 unsigned char mask; 184 }; 185 186 /*** struct re_literal ************************************************ 187 A string of literals, following this structure will be an 188 array of characters: charT[length] 189 ***********************************************************************/ 190 struct re_literal : public re_syntax_base 191 { 192 unsigned int length; 193 }; 194 195 /*** struct re_case ************************************************ 196 Indicates whether we are moving to a case insensive block or not 197 ***********************************************************************/ 198 struct re_case : public re_syntax_base 199 { 200 bool icase; 201 }; 202 203 /*** struct re_set_long *********************************************** 204 A wide character set of characters, following this structure will be 205 an array of type charT: 206 First csingles null-terminated strings 207 Then 2 * cranges NULL terminated strings 208 Then cequivalents NULL terminated strings 209 ***********************************************************************/ 210 template <class mask_type> 211 struct re_set_long : public re_syntax_base 212 { 213 unsigned int csingles, cranges, cequivalents; 214 mask_type cclasses; 215 mask_type cnclasses; 216 bool isnot; 217 bool singleton; 218 }; 219 220 /*** struct re_set **************************************************** 221 A set of narrow-characters, matches any of _map which is none-zero 222 ***********************************************************************/ 223 struct re_set : public re_syntax_base 224 { 225 unsigned char _map[1 << CHAR_BIT]; 226 }; 227 228 /*** struct re_jump *************************************************** 229 Jump to a new location in the machine (not next). 230 ***********************************************************************/ 231 struct re_jump : public re_syntax_base 232 { 233 offset_type alt; // location to jump to 234 }; 235 236 /*** struct re_alt *************************************************** 237 Jump to a new location in the machine (possibly next). 238 ***********************************************************************/ 239 struct re_alt : public re_jump 240 { 241 unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump 242 unsigned int can_be_null; // true if we match a NULL string 243 }; 244 245 /*** struct re_repeat ************************************************* 246 Repeat a section of the machine 247 ***********************************************************************/ 248 struct re_repeat : public re_alt 249 { 250 std::size_t min, max; // min and max allowable repeats 251 int state_id; // Unique identifier for this repeat 252 bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches) 253 bool greedy; // True if this is a greedy repeat 254 }; 255 256 /*** struct re_recurse ************************************************ 257 Recurse to a particular subexpression. 258 **********************************************************************/ 259 struct re_recurse : public re_jump 260 { 261 int state_id; // identifier of first nested repeat within the recursion. 262 }; 263 264 /*** struct re_commit ************************************************* 265 Used for the PRUNE, SKIP and COMMIT verbs which basically differ only in what happens 266 if no match is found and we start searching forward. 267 **********************************************************************/ 268 enum commit_type 269 { 270 commit_prune, 271 commit_skip, 272 commit_commit 273 }; 274 struct re_commit : public re_syntax_base 275 { 276 commit_type action; 277 }; 278 279 /*** enum re_jump_size_type ******************************************* 280 Provides compiled size of re_jump structure (allowing for trailing alignment). 281 We provide this so we know how manybytes to insert when constructing the machine 282 (The value of padding_mask is defined in regex_raw_buffer.hpp). 283 ***********************************************************************/ 284 enum re_jump_size_type 285 { 286 re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), 287 re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), 288 re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) 289 }; 290 291 /*** proc re_is_set_member ********************************************* 292 Forward declaration: we'll need this one later... 293 ***********************************************************************/ 294 295 template<class charT, class traits> 296 struct regex_data; 297 298 template <class iterator, class charT, class traits_type, class char_classT> 299 iterator BOOST_REGEX_CALL re_is_set_member(iterator next, 300 iterator last, 301 const re_set_long<char_classT>* set_, 302 const regex_data<charT, traits_type>& e, bool icase); 303 304 } // namespace BOOST_REGEX_DETAIL_NS 305 306 } // namespace boost 307 308 #ifdef BOOST_MSVC 309 #pragma warning(push) 310 #pragma warning(disable: 4103) 311 #endif 312 #ifdef BOOST_HAS_ABI_HEADERS 313 # include BOOST_ABI_SUFFIX 314 #endif 315 #ifdef BOOST_MSVC 316 #pragma warning(pop) 317 #endif 318 319 #endif 320 321 322