1 /*
2  * re_lexer.cpp -- implementation of Lexer class
3  *
4  * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
5  *                          Alexander Gololobov <agololobov@gmail.com>
6  *
7  * This file is part of Pire, the Perl Incompatible
8  * Regular Expressions library.
9  *
10  * Pire is free software: you can redistribute it and/or modify
11  * it under the terms of the GNU Lesser Public License as published by
12  * the Free Software Foundation, either version 3 of the License, or
13  * (at your option) any later version.
14  *
15  * Pire is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser Public License for more details.
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Pire.  If not, see <http://www.gnu.org/licenses>.
21  */
22 
23 
24 #include <ctype.h>
25 #include "stub/utf8.h"
26 #include "stub/singleton.h"
27 #include <stdexcept>
28 #include "re_lexer.h"
29 #include "re_parser.h"
30 #include "read_unicode.h"
31 #include "fsm.h"
32 #include "stub/stl.h"
33 
34 namespace Pire {
35 
36 namespace Impl {
37 	int yre_parse(Pire::Lexer& lexer);
38 }
39 
Character(wchar32 c)40 Term Term::Character(wchar32 c) { Term::CharacterRange cr; cr.first.insert(Term::String(1, c)); cr.second = false; return Term(TokenTypes::Letters, cr); }
Repetition(int lower,int upper)41 Term Term::Repetition(int lower, int upper) { return Term(TokenTypes::Count, RepetitionCount(lower, upper)); }
Dot()42 Term Term::Dot() { return Term(TokenTypes::Dot, DotTag()); }
BeginMark()43 Term Term::BeginMark() { return Term(TokenTypes::BeginMark, BeginTag()); }
EndMark()44 Term Term::EndMark() { return Term(TokenTypes::EndMark, EndTag()); }
45 
46 Lexer::~Lexer() = default;
47 
GetChar()48 wchar32 Lexer::GetChar()
49 {
50 	if (m_input.empty())
51 		return End;
52 	else if (m_input.front() == '\\') {
53 		m_input.pop_front();
54 		if (m_input.empty())
55 			Error("Regexp must not end with a backslash");
56 		wchar32 ch = m_input.front();
57 		m_input.pop_front();
58 		return Control | ch;
59 	} else {
60 		wchar32 ch = m_input.front();
61 		m_input.pop_front();
62 		return ch;
63 	}
64 }
65 
PeekChar()66 wchar32 Lexer::PeekChar()
67 {
68 	if (m_input.empty())
69 		return End;
70 	else
71 		return m_input.front();
72 }
73 
UngetChar(wchar32 c)74 void Lexer::UngetChar(wchar32 c)
75 {
76 	if (c != End)
77 		m_input.push_front(c);
78 }
79 
80 namespace {
81 	class CompareFeaturesByPriority: public ybinary_function<const Feature::Ptr&, const Feature::Ptr&, bool> {
82 	public:
operator ()(const Feature::Ptr & a,const Feature::Ptr & b) const83 		bool operator()(const Feature::Ptr& a, const Feature::Ptr& b) const
84 		{
85 			return a->Priority() < b->Priority();
86 		}
87 	};
88 }
89 
AddFeature(Feature::Ptr & feature)90 Lexer& Lexer::AddFeature(Feature::Ptr& feature)
91 {
92 	feature->m_lexer = this;
93 	m_features.insert(LowerBound(m_features.begin(), m_features.end(), feature, CompareFeaturesByPriority()), std::move(feature));
94 	return *this;
95 }
96 
AddFeature(Feature::Ptr && feature)97 Lexer& Lexer::AddFeature(Feature::Ptr&& feature)
98 {
99     feature->m_lexer = this;
100     m_features.insert(LowerBound(m_features.begin(), m_features.end(), feature, CompareFeaturesByPriority()), std::move(feature));
101     return *this;
102 }
103 
DoLex()104 Term Lexer::DoLex()
105 {
106 	static const char* controls = "|().*+?^$\\";
107 	for (;;) {
108 		UngetChar(GetChar());
109 		wchar32 ch = PeekChar();
110 		if (ch == End)
111 			return Term(TokenTypes::End);
112 		for (auto&& i : m_features) {
113 			if (i->Accepts(ch)) {
114 				Term ret = i->Lex();
115 				if (ret.Type())
116 					return ret;
117 			}
118 		}
119 		ch = GetChar();
120 
121 		if (ch == '|')
122 			return Term(TokenTypes::Or);
123 		else if (ch == '(') {
124 			return Term(TokenTypes::Open);
125 		} else if (ch == ')')
126 			return Term(TokenTypes::Close);
127 		else if (ch == '.')
128 			return Term::Dot();
129 		else if (ch == '*')
130 			return Term::Repetition(0, Inf);
131 		else if (ch == '+')
132 			return Term::Repetition(1, Inf);
133 		else if (ch == '?')
134 			return Term::Repetition(0, 1);
135 		else if (ch == '^')
136 			return Term::BeginMark();
137 		else if (ch == '$')
138 			return Term::EndMark();
139 		else if ((ch & ControlMask) == Control && strchr(controls, ch & ~ControlMask))
140 			return Term::Character(ch & ~ControlMask);
141 		else
142 			return Term::Character(ch);
143 	}
144 }
145 
Lex()146 Term Lexer::Lex()
147 {
148 	Term t = DoLex();
149 
150 	for (auto i = m_features.rbegin(), ie = m_features.rend(); i != ie; ++i)
151 		(*i)->Alter(t);
152 
153 	if (t.Value().IsA<Term::CharacterRange>()) {
154 		const auto& chars = t.Value().As<Term::CharacterRange>();
155 		//std::cerr << "lex: type " << t.type() << "; chars = { " << join(chars.first.begin(), chars.first.end(), ", ") << " }" << std::endl;
156 		for (auto&& i : chars.first)
157 			for (auto&& j : i)
158 				if ((j & ControlMask) == Control)
159 					Error("Control character in tokens sequence");
160 	}
161 
162 	int type = t.Type();
163 	if (type == TokenTypes::Letters)
164 		type = YRE_LETTERS;
165 	else if (type == TokenTypes::Count)
166 		type = YRE_COUNT;
167 	else if (type == TokenTypes::Dot)
168 		type = YRE_DOT;
169 	else if (type == TokenTypes::Open)
170 		type = '(';
171 	else if (type == TokenTypes::Close)
172 		type = ')';
173 	else if (type == TokenTypes::Or)
174 		type = '|';
175 	else if (type == TokenTypes::And)
176 		type = YRE_AND;
177 	else if (type == TokenTypes::Not)
178 		type = YRE_NOT;
179 	else if (type == TokenTypes::BeginMark)
180 		type = '^';
181 	else if (type == TokenTypes::EndMark)
182 		type = '$';
183 	else if (type == TokenTypes::End)
184 		type = 0;
185 	return Term(type, t.Value());
186 }
187 
Parenthesized(Fsm & fsm)188 void Lexer::Parenthesized(Fsm& fsm)
189 {
190 	for (auto i = m_features.rbegin(), ie = m_features.rend(); i != ie; ++i)
191 		(*i)->Parenthesized(fsm);
192 }
193 
CorrectChar(wchar32 c,const char * controls)194 wchar32 Feature::CorrectChar(wchar32 c, const char* controls)
195 {
196 	bool ctrl = (strchr(controls, c & 0xFF) != 0);
197 	if ((c & ControlMask) == Control && ctrl)
198 		return c & ~ControlMask;
199 	if (c <= 0xFF && ctrl)
200 		return c | Control;
201 	return c;
202 }
203 
204 namespace {
205 	class EnableUnicodeSequencesImpl : public UnicodeReader {
206 	public:
Accepts(wchar32 c) const207 		bool Accepts(wchar32 c) const {
208 			return c == (Control | 'x');
209 		}
210 
Lex()211 		Term Lex() {
212 			return Term::Character(ReadUnicodeCharacter());
213 		}
214 	};
215 
216 	class CharacterRangeReader: public UnicodeReader {
217 	public:
Accepts(wchar32 c) const218 		bool Accepts(wchar32 c) const { return c == '[' || c == (Control | '[') || c == (Control | ']'); }
219 
Lex()220 		Term Lex()
221 		{
222 			static const char* controls = "^[]-\\";
223 			static const char* controls2 = "*+{}()$?.&~";
224 			wchar32 ch = CorrectChar(GetChar(), controls);
225 			if (ch == '[' || ch == ']')
226 				return Term::Character(ch);
227 
228 			Term::CharacterRange cs;
229 			ch = CorrectChar(GetChar(), controls);
230 			if (ch == (Control | '^')) {
231 				cs.second = true;
232 				ch = CorrectChar(GetChar(), controls);
233 			}
234 
235 			bool firstUnicode;
236 			wchar32 unicodeSymbol = 0;
237 
238 			for (; ch != End && ch != (Control | ']'); ch = CorrectChar(GetChar(), controls)) {
239 				if (ch == (Control | 'x')) {
240 					UngetChar(ch);
241 					firstUnicode = true;
242 					unicodeSymbol = ReadUnicodeCharacter();
243 				} else {
244 					firstUnicode = false;
245 				}
246 
247 				if (((ch & ControlMask) != Control || firstUnicode) && CorrectChar(PeekChar(), controls) == (Control | '-')) {
248 					GetChar();
249 					wchar32 current = GetChar();
250 
251 					bool secondUnicode = (current == (Control | 'x'));
252 
253 					wchar32 begin = (firstUnicode) ? unicodeSymbol : ch;
254 					wchar32 end;
255 					if (secondUnicode) {
256 						UngetChar(current);
257 						end = ReadUnicodeCharacter();
258 					} else {
259 						end = CorrectChar(current, controls);
260 						if ((end & ControlMask) == Control)
261 							Error("Wrong character range");
262 					}
263 
264 					for (ch = begin; ch <= end; ++ch) {
265 						cs.first.insert(Term::String(1, ch));
266 					}
267 				} else if (ch == (Control | '-')) {
268 					cs.first.insert(Term::String(1, '-'));
269 				}
270 				else if ((ch & ControlMask) == Control && (strchr(controls2, ch & ~ControlMask) || strchr(controls, ch & ~ControlMask))) {
271 					cs.first.insert(Term::String(1, ch & ~ControlMask));
272 				}
273 				else if ((ch & ControlMask) != Control || !strchr(controls, ch & ~ControlMask)) {
274 					cs.first.insert(Term::String(1, (firstUnicode) ? unicodeSymbol : ch));
275 				} else {
276 					Error("Wrong character in range");
277 				}
278 			}
279 			if (ch == End)
280 				Error("Unexpected end of pattern");
281 
282 			return Term(TokenTypes::Letters, cs);
283 		}
284 	};
285 
286 	class RepetitionCountReader: public Feature {
287 	public:
Accepts(wchar32 c) const288 		bool Accepts(wchar32 c) const { return c == '{' || c == (Control | '{') || c == (Control | '}'); }
289 
Lex()290 		Term Lex()
291 		{
292 			wchar32 ch = GetChar();
293 			if (ch == (Control | '{') || ch == (Control | '}'))
294 				return Term::Character(ch & ~ControlMask);
295 			ch = GetChar();
296 			int lower = 0, upper = 0;
297 
298 			if (!is_digit(ch))
299 				Error("Wrong repetition count");
300 
301 			for (; is_digit(ch); ch = GetChar())
302 				lower = lower * 10 + (ch - '0');
303 			if (ch == '}')
304 				return Term::Repetition(lower, lower);
305 			else if (ch != ',')
306 				Error("Wrong repetition count");
307 
308 			ch = GetChar();
309 			if (ch == '}')
310 				return Term::Repetition(lower, Inf);
311 			else if (!is_digit(ch))
312 				Error("Wrong repetition count");
313 			for (; is_digit(ch); ch = GetChar())
314 				upper = upper * 10 + (ch - '0');
315 
316 			if (ch != '}')
317 				Error("Wrong repetition count");
318 			return Term::Repetition(lower, upper);
319 		}
320 	};
321 
322 	class CaseInsensitiveImpl: public Feature {
323 	public:
Alter(Term & t)324 		void Alter(Term& t)
325 		{
326 			if (t.Value().IsA<Term::CharacterRange>()) {
327 				typedef Term::CharacterRange::first_type CharSet;
328 				const CharSet& old = t.Value().As<Term::CharacterRange>().first;
329 				CharSet altered;
330 				for (auto&& i : old) {
331 					if (i.size() == 1) {
332 						altered.insert(Term::String(1, to_upper(i[0])));
333 						altered.insert(Term::String(1, to_lower(i[0])));
334 					} else
335 						altered.insert(i);
336 				}
337 				t = Term(t.Type(), Term::CharacterRange(altered, t.Value().As<Term::CharacterRange>().second));
338 			}
339 		}
340 	};
341 	class AndNotSupportImpl: public Feature {
342 	public:
Accepts(wchar32 c) const343 		bool Accepts(wchar32 c) const
344 		{
345 			return c == '&' || c == '~' || c == (Control | '&') || c == (Control | '~');
346 		}
347 
Lex()348 		Term Lex()
349 		{
350 			wchar32 ch = GetChar();
351 			if (ch == (Control | '&') || ch == (Control | '~'))
352 				return Term::Character(ch & ~ControlMask);
353 			else if (ch == '&')
354 				return Term(TokenTypes::And);
355 			else if (ch == '~')
356 				return Term(TokenTypes::Not);
357 			else {
358 				Error("Pire::AndNotSupport::Lex(): strange input character");
359 				return Term(0); // Make compiler happy
360 			}
361 		}
362 	};
363 }
364 
365 namespace Features {
CaseInsensitive()366 	Feature::Ptr CaseInsensitive() { return Feature::Ptr(new CaseInsensitiveImpl); }
367 	Feature::Ptr CharClasses();
AndNotSupport()368 	Feature::Ptr AndNotSupport() { return Feature::Ptr(new AndNotSupportImpl); }
369 };
370 
InstallDefaultFeatures()371 void Lexer::InstallDefaultFeatures()
372 {
373 	AddFeature(Feature::Ptr(new CharacterRangeReader));
374 	AddFeature(Feature::Ptr(new RepetitionCountReader));
375 	AddFeature(Features::CharClasses());
376 	AddFeature(Feature::Ptr(new EnableUnicodeSequencesImpl));
377 }
378 
Parse()379 Fsm Lexer::Parse()
380 {
381 	if (!Impl::yre_parse(*this))
382 		return m_retval.As<Fsm>();
383 	else {
384 		Error("Syntax error in regexp");
385 		return Fsm(); // Make compiler happy
386 	}
387 }
388 
389 }
390