1 /*
2 * re_lexer.cpp -- implementation of Lexer class
3 *
4 * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
5 * Alexander Gololobov <agololobov@gmail.com>
6 *
7 * This file is part of Pire, the Perl Incompatible
8 * Regular Expressions library.
9 *
10 * Pire is free software: you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser Public License as published by
12 * the Free Software Foundation, either version 3 of the License, or
13 * (at your option) any later version.
14 *
15 * Pire is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser Public License for more details.
19 * You should have received a copy of the GNU Lesser Public License
20 * along with Pire. If not, see <http://www.gnu.org/licenses>.
21 */
22
23
24 #include <ctype.h>
25 #include "stub/utf8.h"
26 #include "stub/singleton.h"
27 #include <stdexcept>
28 #include "re_lexer.h"
29 #include "re_parser.h"
30 #include "read_unicode.h"
31 #include "fsm.h"
32 #include "stub/stl.h"
33
34 namespace Pire {
35
36 namespace Impl {
37 int yre_parse(Pire::Lexer& lexer);
38 }
39
Character(wchar32 c)40 Term Term::Character(wchar32 c) { Term::CharacterRange cr; cr.first.insert(Term::String(1, c)); cr.second = false; return Term(TokenTypes::Letters, cr); }
Repetition(int lower,int upper)41 Term Term::Repetition(int lower, int upper) { return Term(TokenTypes::Count, RepetitionCount(lower, upper)); }
Dot()42 Term Term::Dot() { return Term(TokenTypes::Dot, DotTag()); }
BeginMark()43 Term Term::BeginMark() { return Term(TokenTypes::BeginMark, BeginTag()); }
EndMark()44 Term Term::EndMark() { return Term(TokenTypes::EndMark, EndTag()); }
45
46 Lexer::~Lexer() = default;
47
GetChar()48 wchar32 Lexer::GetChar()
49 {
50 if (m_input.empty())
51 return End;
52 else if (m_input.front() == '\\') {
53 m_input.pop_front();
54 if (m_input.empty())
55 Error("Regexp must not end with a backslash");
56 wchar32 ch = m_input.front();
57 m_input.pop_front();
58 return Control | ch;
59 } else {
60 wchar32 ch = m_input.front();
61 m_input.pop_front();
62 return ch;
63 }
64 }
65
PeekChar()66 wchar32 Lexer::PeekChar()
67 {
68 if (m_input.empty())
69 return End;
70 else
71 return m_input.front();
72 }
73
UngetChar(wchar32 c)74 void Lexer::UngetChar(wchar32 c)
75 {
76 if (c != End)
77 m_input.push_front(c);
78 }
79
80 namespace {
81 class CompareFeaturesByPriority: public ybinary_function<const Feature::Ptr&, const Feature::Ptr&, bool> {
82 public:
operator ()(const Feature::Ptr & a,const Feature::Ptr & b) const83 bool operator()(const Feature::Ptr& a, const Feature::Ptr& b) const
84 {
85 return a->Priority() < b->Priority();
86 }
87 };
88 }
89
AddFeature(Feature::Ptr & feature)90 Lexer& Lexer::AddFeature(Feature::Ptr& feature)
91 {
92 feature->m_lexer = this;
93 m_features.insert(LowerBound(m_features.begin(), m_features.end(), feature, CompareFeaturesByPriority()), std::move(feature));
94 return *this;
95 }
96
AddFeature(Feature::Ptr && feature)97 Lexer& Lexer::AddFeature(Feature::Ptr&& feature)
98 {
99 feature->m_lexer = this;
100 m_features.insert(LowerBound(m_features.begin(), m_features.end(), feature, CompareFeaturesByPriority()), std::move(feature));
101 return *this;
102 }
103
DoLex()104 Term Lexer::DoLex()
105 {
106 static const char* controls = "|().*+?^$\\";
107 for (;;) {
108 UngetChar(GetChar());
109 wchar32 ch = PeekChar();
110 if (ch == End)
111 return Term(TokenTypes::End);
112 for (auto&& i : m_features) {
113 if (i->Accepts(ch)) {
114 Term ret = i->Lex();
115 if (ret.Type())
116 return ret;
117 }
118 }
119 ch = GetChar();
120
121 if (ch == '|')
122 return Term(TokenTypes::Or);
123 else if (ch == '(') {
124 return Term(TokenTypes::Open);
125 } else if (ch == ')')
126 return Term(TokenTypes::Close);
127 else if (ch == '.')
128 return Term::Dot();
129 else if (ch == '*')
130 return Term::Repetition(0, Inf);
131 else if (ch == '+')
132 return Term::Repetition(1, Inf);
133 else if (ch == '?')
134 return Term::Repetition(0, 1);
135 else if (ch == '^')
136 return Term::BeginMark();
137 else if (ch == '$')
138 return Term::EndMark();
139 else if ((ch & ControlMask) == Control && strchr(controls, ch & ~ControlMask))
140 return Term::Character(ch & ~ControlMask);
141 else
142 return Term::Character(ch);
143 }
144 }
145
Lex()146 Term Lexer::Lex()
147 {
148 Term t = DoLex();
149
150 for (auto i = m_features.rbegin(), ie = m_features.rend(); i != ie; ++i)
151 (*i)->Alter(t);
152
153 if (t.Value().IsA<Term::CharacterRange>()) {
154 const auto& chars = t.Value().As<Term::CharacterRange>();
155 //std::cerr << "lex: type " << t.type() << "; chars = { " << join(chars.first.begin(), chars.first.end(), ", ") << " }" << std::endl;
156 for (auto&& i : chars.first)
157 for (auto&& j : i)
158 if ((j & ControlMask) == Control)
159 Error("Control character in tokens sequence");
160 }
161
162 int type = t.Type();
163 if (type == TokenTypes::Letters)
164 type = YRE_LETTERS;
165 else if (type == TokenTypes::Count)
166 type = YRE_COUNT;
167 else if (type == TokenTypes::Dot)
168 type = YRE_DOT;
169 else if (type == TokenTypes::Open)
170 type = '(';
171 else if (type == TokenTypes::Close)
172 type = ')';
173 else if (type == TokenTypes::Or)
174 type = '|';
175 else if (type == TokenTypes::And)
176 type = YRE_AND;
177 else if (type == TokenTypes::Not)
178 type = YRE_NOT;
179 else if (type == TokenTypes::BeginMark)
180 type = '^';
181 else if (type == TokenTypes::EndMark)
182 type = '$';
183 else if (type == TokenTypes::End)
184 type = 0;
185 return Term(type, t.Value());
186 }
187
Parenthesized(Fsm & fsm)188 void Lexer::Parenthesized(Fsm& fsm)
189 {
190 for (auto i = m_features.rbegin(), ie = m_features.rend(); i != ie; ++i)
191 (*i)->Parenthesized(fsm);
192 }
193
CorrectChar(wchar32 c,const char * controls)194 wchar32 Feature::CorrectChar(wchar32 c, const char* controls)
195 {
196 bool ctrl = (strchr(controls, c & 0xFF) != 0);
197 if ((c & ControlMask) == Control && ctrl)
198 return c & ~ControlMask;
199 if (c <= 0xFF && ctrl)
200 return c | Control;
201 return c;
202 }
203
204 namespace {
205 class EnableUnicodeSequencesImpl : public UnicodeReader {
206 public:
Accepts(wchar32 c) const207 bool Accepts(wchar32 c) const {
208 return c == (Control | 'x');
209 }
210
Lex()211 Term Lex() {
212 return Term::Character(ReadUnicodeCharacter());
213 }
214 };
215
216 class CharacterRangeReader: public UnicodeReader {
217 public:
Accepts(wchar32 c) const218 bool Accepts(wchar32 c) const { return c == '[' || c == (Control | '[') || c == (Control | ']'); }
219
Lex()220 Term Lex()
221 {
222 static const char* controls = "^[]-\\";
223 static const char* controls2 = "*+{}()$?.&~";
224 wchar32 ch = CorrectChar(GetChar(), controls);
225 if (ch == '[' || ch == ']')
226 return Term::Character(ch);
227
228 Term::CharacterRange cs;
229 ch = CorrectChar(GetChar(), controls);
230 if (ch == (Control | '^')) {
231 cs.second = true;
232 ch = CorrectChar(GetChar(), controls);
233 }
234
235 bool firstUnicode;
236 wchar32 unicodeSymbol = 0;
237
238 for (; ch != End && ch != (Control | ']'); ch = CorrectChar(GetChar(), controls)) {
239 if (ch == (Control | 'x')) {
240 UngetChar(ch);
241 firstUnicode = true;
242 unicodeSymbol = ReadUnicodeCharacter();
243 } else {
244 firstUnicode = false;
245 }
246
247 if (((ch & ControlMask) != Control || firstUnicode) && CorrectChar(PeekChar(), controls) == (Control | '-')) {
248 GetChar();
249 wchar32 current = GetChar();
250
251 bool secondUnicode = (current == (Control | 'x'));
252
253 wchar32 begin = (firstUnicode) ? unicodeSymbol : ch;
254 wchar32 end;
255 if (secondUnicode) {
256 UngetChar(current);
257 end = ReadUnicodeCharacter();
258 } else {
259 end = CorrectChar(current, controls);
260 if ((end & ControlMask) == Control)
261 Error("Wrong character range");
262 }
263
264 for (ch = begin; ch <= end; ++ch) {
265 cs.first.insert(Term::String(1, ch));
266 }
267 } else if (ch == (Control | '-')) {
268 cs.first.insert(Term::String(1, '-'));
269 }
270 else if ((ch & ControlMask) == Control && (strchr(controls2, ch & ~ControlMask) || strchr(controls, ch & ~ControlMask))) {
271 cs.first.insert(Term::String(1, ch & ~ControlMask));
272 }
273 else if ((ch & ControlMask) != Control || !strchr(controls, ch & ~ControlMask)) {
274 cs.first.insert(Term::String(1, (firstUnicode) ? unicodeSymbol : ch));
275 } else {
276 Error("Wrong character in range");
277 }
278 }
279 if (ch == End)
280 Error("Unexpected end of pattern");
281
282 return Term(TokenTypes::Letters, cs);
283 }
284 };
285
286 class RepetitionCountReader: public Feature {
287 public:
Accepts(wchar32 c) const288 bool Accepts(wchar32 c) const { return c == '{' || c == (Control | '{') || c == (Control | '}'); }
289
Lex()290 Term Lex()
291 {
292 wchar32 ch = GetChar();
293 if (ch == (Control | '{') || ch == (Control | '}'))
294 return Term::Character(ch & ~ControlMask);
295 ch = GetChar();
296 int lower = 0, upper = 0;
297
298 if (!is_digit(ch))
299 Error("Wrong repetition count");
300
301 for (; is_digit(ch); ch = GetChar())
302 lower = lower * 10 + (ch - '0');
303 if (ch == '}')
304 return Term::Repetition(lower, lower);
305 else if (ch != ',')
306 Error("Wrong repetition count");
307
308 ch = GetChar();
309 if (ch == '}')
310 return Term::Repetition(lower, Inf);
311 else if (!is_digit(ch))
312 Error("Wrong repetition count");
313 for (; is_digit(ch); ch = GetChar())
314 upper = upper * 10 + (ch - '0');
315
316 if (ch != '}')
317 Error("Wrong repetition count");
318 return Term::Repetition(lower, upper);
319 }
320 };
321
322 class CaseInsensitiveImpl: public Feature {
323 public:
Alter(Term & t)324 void Alter(Term& t)
325 {
326 if (t.Value().IsA<Term::CharacterRange>()) {
327 typedef Term::CharacterRange::first_type CharSet;
328 const CharSet& old = t.Value().As<Term::CharacterRange>().first;
329 CharSet altered;
330 for (auto&& i : old) {
331 if (i.size() == 1) {
332 altered.insert(Term::String(1, to_upper(i[0])));
333 altered.insert(Term::String(1, to_lower(i[0])));
334 } else
335 altered.insert(i);
336 }
337 t = Term(t.Type(), Term::CharacterRange(altered, t.Value().As<Term::CharacterRange>().second));
338 }
339 }
340 };
341 class AndNotSupportImpl: public Feature {
342 public:
Accepts(wchar32 c) const343 bool Accepts(wchar32 c) const
344 {
345 return c == '&' || c == '~' || c == (Control | '&') || c == (Control | '~');
346 }
347
Lex()348 Term Lex()
349 {
350 wchar32 ch = GetChar();
351 if (ch == (Control | '&') || ch == (Control | '~'))
352 return Term::Character(ch & ~ControlMask);
353 else if (ch == '&')
354 return Term(TokenTypes::And);
355 else if (ch == '~')
356 return Term(TokenTypes::Not);
357 else {
358 Error("Pire::AndNotSupport::Lex(): strange input character");
359 return Term(0); // Make compiler happy
360 }
361 }
362 };
363 }
364
365 namespace Features {
CaseInsensitive()366 Feature::Ptr CaseInsensitive() { return Feature::Ptr(new CaseInsensitiveImpl); }
367 Feature::Ptr CharClasses();
AndNotSupport()368 Feature::Ptr AndNotSupport() { return Feature::Ptr(new AndNotSupportImpl); }
369 };
370
InstallDefaultFeatures()371 void Lexer::InstallDefaultFeatures()
372 {
373 AddFeature(Feature::Ptr(new CharacterRangeReader));
374 AddFeature(Feature::Ptr(new RepetitionCountReader));
375 AddFeature(Features::CharClasses());
376 AddFeature(Feature::Ptr(new EnableUnicodeSequencesImpl));
377 }
378
Parse()379 Fsm Lexer::Parse()
380 {
381 if (!Impl::yre_parse(*this))
382 return m_retval.As<Fsm>();
383 else {
384 Error("Syntax error in regexp");
385 return Fsm(); // Make compiler happy
386 }
387 }
388
389 }
390