1 // class template regex -*- C++ -*-
2
3 // Copyright (C) 2010, 2011 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24
25 /**
26 * @file bits/regex_nfa.h
27 * This is an internal header file, included by other library headers.
28 * Do not attempt to use it directly. @headername{regex}
29 */
30
_GLIBCXX_VISIBILITY(default)31 namespace std _GLIBCXX_VISIBILITY(default)
32 {
33 namespace __regex
34 {
35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
36
37 // Base class for, um, automata. Could be an NFA or a DFA. Your choice.
38 class _Automaton
39 {
40 public:
41 typedef unsigned int _SizeT;
42
43 public:
44 virtual
45 ~_Automaton() { }
46
47 virtual _SizeT
48 _M_sub_count() const = 0;
49
50 #ifdef _GLIBCXX_DEBUG
51 virtual std::ostream&
52 _M_dot(std::ostream& __ostr) const = 0;
53 #endif
54 };
55
56 // Generic shared pointer to an automaton.
57 typedef std::shared_ptr<_Automaton> _AutomatonPtr;
58
59 // Operation codes that define the type of transitions within the base NFA
60 // that represents the regular expression.
61 enum _Opcode
62 {
63 _S_opcode_unknown = 0,
64 _S_opcode_alternative = 1,
65 _S_opcode_subexpr_begin = 4,
66 _S_opcode_subexpr_end = 5,
67 _S_opcode_match = 100,
68 _S_opcode_accept = 255
69 };
70
71 // Provides a generic facade for a templated match_results.
72 struct _Results
73 {
74 virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;
75 virtual void _M_set_matched(int __i, bool __is_matched) = 0;
76 };
77
78 // Tags current state (for subexpr begin/end).
79 typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
80
81 template<typename _FwdIterT, typename _TraitsT>
82 struct _StartTagger
83 {
84 explicit
85 _StartTagger(int __i)
86 : _M_index(__i)
87 { }
88
89 void
90 operator()(const _PatternCursor& __pc, _Results& __r)
91 { __r._M_set_pos(_M_index, 0, __pc); }
92
93 int _M_index;
94 };
95
96 template<typename _FwdIterT, typename _TraitsT>
97 struct _EndTagger
98 {
99 explicit
100 _EndTagger(int __i)
101 : _M_index(__i)
102 { }
103
104 void
105 operator()(const _PatternCursor& __pc, _Results& __r)
106 { __r._M_set_pos(_M_index, 1, __pc); }
107
108 int _M_index;
109 _FwdIterT _M_pos;
110 };
111 // Indicates if current state matches cursor current.
112 typedef std::function<bool (const _PatternCursor&)> _Matcher;
113
114 // Matches any character
115 inline bool
116 _AnyMatcher(const _PatternCursor&)
117 { return true; }
118
119 // Matches a single character
120 template<typename _InIterT, typename _TraitsT>
121 struct _CharMatcher
122 {
123 typedef typename _TraitsT::char_type char_type;
124
125 explicit
126 _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
127 : _M_traits(__t), _M_c(_M_traits.translate(__c))
128 { }
129
130 bool
131 operator()(const _PatternCursor& __pc) const
132 {
133 typedef const _SpecializedCursor<_InIterT>& _CursorT;
134 _CursorT __c = static_cast<_CursorT>(__pc);
135 return _M_traits.translate(__c._M_current()) == _M_c;
136 }
137
138 const _TraitsT& _M_traits;
139 char_type _M_c;
140 };
141
142 // Matches a character range (bracket expression)
143 template<typename _InIterT, typename _TraitsT>
144 struct _RangeMatcher
145 {
146 typedef typename _TraitsT::char_type _CharT;
147 typedef std::basic_string<_CharT> _StringT;
148
149 explicit
150 _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
151 : _M_traits(__t), _M_is_non_matching(__is_non_matching)
152 { }
153
154 bool
155 operator()(const _PatternCursor& __pc) const
156 {
157 typedef const _SpecializedCursor<_InIterT>& _CursorT;
158 _CursorT __c = static_cast<_CursorT>(__pc);
159 return true;
160 }
161
162 void
163 _M_add_char(_CharT __c)
164 { }
165
166 void
167 _M_add_collating_element(const _StringT& __s)
168 { }
169
170 void
171 _M_add_equivalence_class(const _StringT& __s)
172 { }
173
174 void
175 _M_add_character_class(const _StringT& __s)
176 { }
177
178 void
179 _M_make_range()
180 { }
181
182 const _TraitsT& _M_traits;
183 bool _M_is_non_matching;
184 };
185
186 // Identifies a state in the NFA.
187 typedef int _StateIdT;
188
189 // The special case in which a state identifier is not an index.
190 static const _StateIdT _S_invalid_state_id = -1;
191
192
193 // An individual state in an NFA
194 //
195 // In this case a "state" is an entry in the NFA definition coupled with its
196 // outgoing transition(s). All states have a single outgoing transition,
197 // except for accepting states (which have no outgoing transitions) and alt
198 // states, which have two outgoing transitions.
199 //
200 struct _State
201 {
202 typedef int _OpcodeT;
203
204 _OpcodeT _M_opcode; // type of outgoing transition
205 _StateIdT _M_next; // outgoing transition
206 _StateIdT _M_alt; // for _S_opcode_alternative
207 unsigned int _M_subexpr; // for _S_opcode_subexpr_*
208 _Tagger _M_tagger; // for _S_opcode_subexpr_*
209 _Matcher _M_matches; // for _S_opcode_match
210
211 explicit _State(_OpcodeT __opcode)
212 : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
213 { }
214
215 _State(const _Matcher& __m)
216 : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
217 { }
218
219 _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
220 : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
221 _M_tagger(__t)
222 { }
223
224 _State(_StateIdT __next, _StateIdT __alt)
225 : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
226 { }
227
228 #ifdef _GLIBCXX_DEBUG
229 std::ostream&
230 _M_print(std::ostream& ostr) const;
231
232 // Prints graphviz dot commands for state.
233 std::ostream&
234 _M_dot(std::ostream& __ostr, _StateIdT __id) const;
235 #endif
236 };
237
238
239 // The Grep Matcher works on sets of states. Here are sets of states.
240 typedef std::set<_StateIdT> _StateSet;
241
242 // A collection of all states making up an NFA
243 //
244 // An NFA is a 4-tuple M = (K, S, s, F), where
245 // K is a finite set of states,
246 // S is the alphabet of the NFA,
247 // s is the initial state,
248 // F is a set of final (accepting) states.
249 //
250 // This NFA class is templated on S, a type that will hold values of the
251 // underlying alphabet (without regard to semantics of that alphabet). The
252 // other elements of the tuple are generated during construction of the NFA
253 // and are available through accessor member functions.
254 //
255 class _Nfa
256 : public _Automaton, public std::vector<_State>
257 {
258 public:
259 typedef _State _StateT;
260 typedef unsigned int _SizeT;
261 typedef regex_constants::syntax_option_type _FlagT;
262
263 public:
264 _Nfa(_FlagT __f)
265 : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
266 { }
267
268 ~_Nfa()
269 { }
270
271 _FlagT
272 _M_options() const
273 { return _M_flags; }
274
275 _StateIdT
276 _M_start() const
277 { return _M_start_state; }
278
279 const _StateSet&
280 _M_final_states() const
281 { return _M_accepting_states; }
282
283 _SizeT
284 _M_sub_count() const
285 { return _M_subexpr_count; }
286
287 _StateIdT
288 _M_insert_accept()
289 {
290 this->push_back(_StateT(_S_opcode_accept));
291 _M_accepting_states.insert(this->size()-1);
292 return this->size()-1;
293 }
294
295 _StateIdT
296 _M_insert_alt(_StateIdT __next, _StateIdT __alt)
297 {
298 this->push_back(_StateT(__next, __alt));
299 return this->size()-1;
300 }
301
302 _StateIdT
303 _M_insert_matcher(_Matcher __m)
304 {
305 this->push_back(_StateT(__m));
306 return this->size()-1;
307 }
308
309 _StateIdT
310 _M_insert_subexpr_begin(const _Tagger& __t)
311 {
312 this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t));
313 return this->size()-1;
314 }
315
316 _StateIdT
317 _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
318 {
319 this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
320 return this->size()-1;
321 }
322
323 #ifdef _GLIBCXX_DEBUG
324 std::ostream&
325 _M_dot(std::ostream& __ostr) const;
326 #endif
327
328 private:
329 _FlagT _M_flags;
330 _StateIdT _M_start_state;
331 _StateSet _M_accepting_states;
332 _SizeT _M_subexpr_count;
333 };
334
335 // Describes a sequence of one or more %_State, its current start and end(s).
336 //
337 // This structure contains fragments of an NFA during construction.
338 class _StateSeq
339 {
340 public:
341 // Constructs a single-node sequence
342 _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id)
343 : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
344 { }
345 // Constructs a split sequence from two other sequencces
346 _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
347 : _M_nfa(__e1._M_nfa),
348 _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),
349 _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)
350 { }
351
352 // Constructs a split sequence from a single sequence
353 _StateSeq(const _StateSeq& __e, _StateIdT __id)
354 : _M_nfa(__e._M_nfa),
355 _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),
356 _M_end1(__id), _M_end2(__e._M_end1)
357 { }
358
359 // Constructs a copy of a %_StateSeq
360 _StateSeq(const _StateSeq& __rhs)
361 : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
362 _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
363 { }
364
365
366 _StateSeq& operator=(const _StateSeq& __rhs);
367
368 _StateIdT
369 _M_front() const
370 { return _M_start; }
371
372 // Extends a sequence by one.
373 void
374 _M_push_back(_StateIdT __id);
375
376 // Extends and maybe joins a sequence.
377 void
378 _M_append(_StateIdT __id);
379
380 void
381 _M_append(_StateSeq& __rhs);
382
383 // Clones an entire sequence.
384 _StateIdT
385 _M_clone();
386
387 private:
388 _Nfa& _M_nfa;
389 _StateIdT _M_start;
390 _StateIdT _M_end1;
391 _StateIdT _M_end2;
392
393 };
394
395 _GLIBCXX_END_NAMESPACE_VERSION
396 } // namespace __regex
397 } // namespace std
398
399 #include <bits/regex_nfa.tcc>
400
401