1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2018 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library.  This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  *  @file bits/regex_scanner.h
27  *  This is an internal header file, included by other library headers.
28  *  Do not attempt to use it directly. @headername{regex}
29  */
30 
31 namespace std _GLIBCXX_VISIBILITY(default)
32 {
33 _GLIBCXX_BEGIN_NAMESPACE_VERSION
34 
35 namespace __detail
36 {
37   /**
38    * @addtogroup regex-detail
39    * @{
40    */
41 
42   struct _ScannerBase
43   {
44   public:
45     /// Token types returned from the scanner.
46     enum _TokenT : unsigned
47     {
48       _S_token_anychar,
49       _S_token_ord_char,
50       _S_token_oct_num,
51       _S_token_hex_num,
52       _S_token_backref,
53       _S_token_subexpr_begin,
54       _S_token_subexpr_no_group_begin,
55       _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
56       _S_token_subexpr_end,
57       _S_token_bracket_begin,
58       _S_token_bracket_neg_begin,
59       _S_token_bracket_end,
60       _S_token_interval_begin,
61       _S_token_interval_end,
62       _S_token_quoted_class,
63       _S_token_char_class_name,
64       _S_token_collsymbol,
65       _S_token_equiv_class_name,
66       _S_token_opt,
67       _S_token_or,
68       _S_token_closure0,
69       _S_token_closure1,
70       _S_token_line_begin,
71       _S_token_line_end,
72       _S_token_word_bound, // neg if _M_value[0] == 'n'
73       _S_token_comma,
74       _S_token_dup_count,
75       _S_token_eof,
76       _S_token_bracket_dash,
77       _S_token_unknown = -1u
78     };
79 
80   protected:
81     typedef regex_constants::syntax_option_type _FlagT;
82 
83     enum _StateT
84     {
85       _S_state_normal,
86       _S_state_in_brace,
87       _S_state_in_bracket,
88     };
89 
90   protected:
91     _ScannerBase(_FlagT __flags)
92     : _M_state(_S_state_normal),
93     _M_flags(__flags),
94     _M_escape_tbl(_M_is_ecma()
95 		  ? _M_ecma_escape_tbl
96 		  : _M_awk_escape_tbl),
97     _M_spec_char(_M_is_ecma()
98 		 ? _M_ecma_spec_char
99 		 : _M_flags & regex_constants::basic
100 		 ? _M_basic_spec_char
101 		 : _M_flags & regex_constants::extended
102 		 ? _M_extended_spec_char
103 		 : _M_flags & regex_constants::grep
104 		 ?  ".[\\*^$\n"
105 		 : _M_flags & regex_constants::egrep
106 		 ? ".[\\()*+?{|^$\n"
107 		 : _M_flags & regex_constants::awk
108 		 ? _M_extended_spec_char
109 		 : nullptr),
110     _M_at_bracket_start(false)
111     { __glibcxx_assert(_M_spec_char); }
112 
113   protected:
114     const char*
115     _M_find_escape(char __c)
116     {
117       auto __it = _M_escape_tbl;
118       for (; __it->first != '\0'; ++__it)
119 	if (__it->first == __c)
120 	  return &__it->second;
121       return nullptr;
122     }
123 
124     bool
125     _M_is_ecma() const
126     { return _M_flags & regex_constants::ECMAScript; }
127 
128     bool
129     _M_is_basic() const
130     { return _M_flags & (regex_constants::basic | regex_constants::grep); }
131 
132     bool
133     _M_is_extended() const
134     {
135       return _M_flags & (regex_constants::extended
136 			 | regex_constants::egrep
137 			 | regex_constants::awk);
138     }
139 
140     bool
141     _M_is_grep() const
142     { return _M_flags & (regex_constants::grep | regex_constants::egrep); }
143 
144     bool
145     _M_is_awk() const
146     { return _M_flags & regex_constants::awk; }
147 
148   protected:
149     // TODO: Make them static in the next abi change.
150     const std::pair<char, _TokenT> _M_token_tbl[9] =
151       {
152 	{'^', _S_token_line_begin},
153 	{'$', _S_token_line_end},
154 	{'.', _S_token_anychar},
155 	{'*', _S_token_closure0},
156 	{'+', _S_token_closure1},
157 	{'?', _S_token_opt},
158 	{'|', _S_token_or},
159 	{'\n', _S_token_or}, // grep and egrep
160 	{'\0', _S_token_or},
161       };
162     const std::pair<char, char> _M_ecma_escape_tbl[8] =
163       {
164 	{'0', '\0'},
165 	{'b', '\b'},
166 	{'f', '\f'},
167 	{'n', '\n'},
168 	{'r', '\r'},
169 	{'t', '\t'},
170 	{'v', '\v'},
171 	{'\0', '\0'},
172       };
173     const std::pair<char, char> _M_awk_escape_tbl[11] =
174       {
175 	{'"', '"'},
176 	{'/', '/'},
177 	{'\\', '\\'},
178 	{'a', '\a'},
179 	{'b', '\b'},
180 	{'f', '\f'},
181 	{'n', '\n'},
182 	{'r', '\r'},
183 	{'t', '\t'},
184 	{'v', '\v'},
185 	{'\0', '\0'},
186       };
187     const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|";
188     const char* _M_basic_spec_char = ".[\\*^$";
189     const char* _M_extended_spec_char = ".[\\()*+?{|^$";
190 
191     _StateT                       _M_state;
192     _FlagT                        _M_flags;
193     _TokenT                       _M_token;
194     const std::pair<char, char>*  _M_escape_tbl;
195     const char*                   _M_spec_char;
196     bool                          _M_at_bracket_start;
197   };
198 
199   /**
200    * @brief Scans an input range for regex tokens.
201    *
202    * The %_Scanner class interprets the regular expression pattern in
203    * the input range passed to its constructor as a sequence of parse
204    * tokens passed to the regular expression compiler.  The sequence
205    * of tokens provided depends on the flag settings passed to the
206    * constructor: different regular expression grammars will interpret
207    * the same input pattern in syntactically different ways.
208    */
209   template<typename _CharT>
210     class _Scanner
211     : public _ScannerBase
212     {
213     public:
214       typedef const _CharT*                                       _IterT;
215       typedef std::basic_string<_CharT>                           _StringT;
216       typedef regex_constants::syntax_option_type                 _FlagT;
217       typedef const std::ctype<_CharT>                            _CtypeT;
218 
219       _Scanner(_IterT __begin, _IterT __end,
220 	       _FlagT __flags, std::locale __loc);
221 
222       void
223       _M_advance();
224 
225       _TokenT
226       _M_get_token() const
227       { return _M_token; }
228 
229       const _StringT&
230       _M_get_value() const
231       { return _M_value; }
232 
233 #ifdef _GLIBCXX_DEBUG
234       std::ostream&
235       _M_print(std::ostream&);
236 #endif
237 
238     private:
239       void
240       _M_scan_normal();
241 
242       void
243       _M_scan_in_bracket();
244 
245       void
246       _M_scan_in_brace();
247 
248       void
249       _M_eat_escape_ecma();
250 
251       void
252       _M_eat_escape_posix();
253 
254       void
255       _M_eat_escape_awk();
256 
257       void
258       _M_eat_class(char);
259 
260       _IterT                        _M_current;
261       _IterT                        _M_end;
262       _CtypeT&                      _M_ctype;
263       _StringT                      _M_value;
264       void (_Scanner::* _M_eat_escape)();
265     };
266 
267  //@} regex-detail
268 } // namespace __detail
269 _GLIBCXX_END_NAMESPACE_VERSION
270 } // namespace std
271 
272 #include <bits/regex_scanner.tcc>
273