1 /*
2  * slow.h -- definition of the SlowScanner
3  *
4  * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
5  *                          Alexander Gololobov <agololobov@gmail.com>
6  *
7  * This file is part of Pire, the Perl Incompatible
8  * Regular Expressions library.
9  *
10  * Pire is free software: you can redistribute it and/or modify
11  * it under the terms of the GNU Lesser Public License as published by
12  * the Free Software Foundation, either version 3 of the License, or
13  * (at your option) any later version.
14  *
15  * Pire is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser Public License for more details.
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Pire.  If not, see <http://www.gnu.org/licenses>.
21  */
22 
23 
24 #ifndef PIRE_SCANNERS_SLOW_H
25 #define PIRE_SCANNERS_SLOW_H
26 
27 #include "common.h"
28 #include "../approx_matching.h"
29 #include "../stub/stl.h"
30 #include "../partition.h"
31 #include "../vbitset.h"
32 #include "../fsm.h"
33 #include "../run.h"
34 #include "../stub/saveload.h"
35 
36 #ifdef PIRE_DEBUG
37 #include <iostream>
38 #include "../stub/lexical_cast.h"
39 #endif
40 
41 namespace Pire {
42 
43 /**
44  * A 'slow' scanner.
45  * Takes O( str.length() * this->m_states.size() ) time to scan string,
46  * but does not require FSM to be deterministic.
47  * Thus can be used to handle something sorta /x.{40}$/,
48  * where deterministic FSM contains 2^40 states and hence cannot fit
49  * in memory.
50  */
51 class SlowScanner {
52 public:
53 	typedef size_t      Transition;
54 	typedef ui16        Letter;
55 	typedef ui32        Action;
56 	typedef ui8         Tag;
57 
58 	enum {
59 		FinalFlag = 1,
60 		DeadFlag  = 0
61 	};
62 
63 	struct State {
64 		TVector<unsigned> states;
65 		BitSet flags;
66 
StateState67 		State() {}
StateState68 		State(size_t size): flags(size) { states.reserve(size); }
SwapState69 		void Swap(State& s) { states.swap(s.states); flags.Swap(s.flags); }
70 
71 #ifdef PIRE_DEBUG
72 		friend yostream& operator << (yostream& stream, const State& state) { return stream << Join(state.states.begin(), state.states.end(), ", "); }
73 #endif
74 	};
75 
76 	SlowScanner(bool needActions = false) {
77 		Alias(Null());
78 		need_actions = needActions;
79 	}
80 
GetLettersCount()81 	size_t GetLettersCount() const {return m.lettersCount; };
82 
Size()83 	size_t Size() const { return GetSize(); }
GetSize()84 	size_t GetSize() const { return m.statesCount; }
Empty()85 	bool Empty() const { return m_finals == Null().m_finals; }
86 
Id()87 	size_t Id() const {return (size_t) -1;}
RegexpsCount()88 	size_t RegexpsCount() const { return Empty() ? 0 : 1; }
89 
Initialize(State & state)90 	void Initialize(State& state) const
91 	{
92 		state.states.clear();
93 		state.states.reserve(m.statesCount);
94 		state.states.push_back(m.start);
95 		BitSet(m.statesCount).Swap(state.flags);
96 	}
97 
Translate(Char ch)98 	Char Translate(Char ch) const
99 	{
100 		return m_letters[static_cast<size_t>(ch)];
101 	}
102 
NextTranslated(const State & current,State & next,Char l)103 	Action NextTranslated(const State& current, State& next, Char l) const
104 	{
105 		next.flags.Clear();
106 		next.states.clear();
107 		for (auto&& state : current.states) {
108 			const unsigned* begin = 0;
109 			const unsigned* end = 0;
110 			if (!m_vecptr) {
111 				const size_t* pos = m_jumpPos + state * m.lettersCount + l;
112 				begin = m_jumps + pos[0];
113 				end = m_jumps + pos[1];
114 			} else {
115 				const auto& v = (*m_vecptr)[state * m.lettersCount + l];
116 				if (!v.empty()) {
117 					begin = &v[0];
118 					end = &v[0] + v.size();
119 				}
120 			}
121 
122 			for (; begin != end; ++begin)
123 				if (!next.flags.Test(*begin)) {
124 					next.flags.Set(*begin);
125 					next.states.push_back(*begin);
126 				}
127 		}
128 
129 		return 0;
130 	}
131 
Next(const State & current,State & next,Char c)132 	Action Next(const State& current, State& next, Char c) const
133 	{
134 		return NextTranslated(current, next, Translate(c));
135 	}
136 
TakeAction(State &,Action)137 	bool TakeAction(State&, Action) const { return false; }
138 
NextTranslated(State & s,Char l)139 	Action NextTranslated(State& s, Char l) const
140 	{
141 		State dest(m.statesCount);
142 		Action a = NextTranslated(s, dest, l);
143 		s.Swap(dest);
144 		return a;
145 	}
146 
Next(State & s,Char c)147 	Action Next(State& s, Char c) const
148 	{
149 		return NextTranslated(s, Translate(c));
150 	}
151 
Final(const State & s)152 	bool Final(const State& s) const
153 	{
154 		for (auto&& state : s.states)
155 			if (m_finals[state])
156 				return true;
157 		return false;
158 	}
159 
Dead(const State &)160 	bool Dead(const State&) const
161 	{
162 		return false;
163 	}
164 
AcceptedRegexps(const State & s)165 	ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const {
166 		return Final(s) ? Accept() : Deny();
167 	}
168 
CanStop(const State & s)169 	bool CanStop(const State& s) const {
170 		return Final(s);
171 	}
172 
Mmap(const void * ptr,size_t size)173 	const void* Mmap(const void* ptr, size_t size)
174 	{
175 		Impl::CheckAlign(ptr);
176 		SlowScanner s;
177 		const size_t* p = reinterpret_cast<const size_t*>(ptr);
178 
179 		Impl::ValidateHeader(p, size, ScannerIOTypes::SlowScanner, sizeof(s.m));
180 		Locals* locals;
181 		Impl::MapPtr(locals, 1, p, size);
182 		memcpy(&s.m, locals, sizeof(s.m));
183 
184 		bool empty = *((const bool*) p);
185 		Impl::AdvancePtr(p, size, sizeof(empty));
186 		Impl::AlignPtr(p, size);
187 
188 		if (empty)
189 			s.Alias(Null());
190 		else {
191 			s.m_vecptr = 0;
192 			Impl::MapPtr(s.m_letters, MaxChar, p, size);
193 			Impl::MapPtr(s.m_finals, s.m.statesCount, p, size);
194 			Impl::MapPtr(s.m_jumpPos, s.m.statesCount * s.m.lettersCount + 1, p, size);
195 			Impl::MapPtr(s.m_jumps, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size);
196 			if (need_actions)
197 				Impl::MapPtr(s.m_actions, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size);
198 			Swap(s);
199 		}
200 		return (const void*) p;
201 	}
202 
Swap(SlowScanner & s)203 	void Swap(SlowScanner& s)
204 	{
205 		DoSwap(m_finals, s.m_finals);
206 		DoSwap(m_jumps, s.m_jumps);
207 		DoSwap(m_actions, s.m_actions);
208 		DoSwap(m_jumpPos, s.m_jumpPos);
209 		DoSwap(m.statesCount, s.m.statesCount);
210 		DoSwap(m.lettersCount, s.m.lettersCount);
211 		DoSwap(m.start, s.m.start);
212 		DoSwap(m_letters, s.m_letters);
213 		DoSwap(m_pool, s.m_pool);
214 		DoSwap(m_vec, s.m_vec);
215 
216 		DoSwap(m_vecptr, s.m_vecptr);
217 		DoSwap(need_actions, s.need_actions);
218 		DoSwap(m_actionsvec, s.m_actionsvec);
219 		if (m_vecptr == &s.m_vec)
220 			m_vecptr = &m_vec;
221 		if (s.m_vecptr == &m_vec)
222 			s.m_vecptr = &s.m_vec;
223 	}
224 
SlowScanner(const SlowScanner & s)225 	SlowScanner(const SlowScanner& s)
226 		: m(s.m)
227 		, m_vec(s.m_vec)
228 		, need_actions(s.need_actions)
229 		, m_actionsvec(s.m_actionsvec)
230 	{
231 		if (s.m_vec.empty()) {
232 			// Empty or mmap()-ed scanner, just copy pointers
233 			m_finals = s.m_finals;
234 			m_jumps = s.m_jumps;
235 			m_actions = s.m_actions;
236 			m_jumpPos = s.m_jumpPos;
237 			m_letters = s.m_letters;
238 			m_vecptr = 0;
239 		} else {
240 			// In-memory scanner, perform deep copy
241 			alloc(m_letters, MaxChar);
242 			memcpy(m_letters, s.m_letters, sizeof(*m_letters) * MaxChar);
243 			m_jumps = 0;
244 			m_jumpPos = 0;
245 			m_actions = 0;
246 			alloc(m_finals, m.statesCount);
247 			memcpy(m_finals, s.m_finals, sizeof(*m_finals) * m.statesCount);
248 			m_vecptr = &m_vec;
249 		}
250 	}
251 
252 	explicit SlowScanner(Fsm& fsm, bool needActions = false, bool removeEpsilons = true, size_t distance = 0)
need_actions(needActions)253 		: need_actions(needActions)
254 	{
255 		if (distance) {
256 			fsm = CreateApproxFsm(fsm, distance);
257 		}
258 		if (removeEpsilons)
259 			fsm.RemoveEpsilons();
260 		fsm.Sparse(!removeEpsilons);
261 
262 		m.statesCount = fsm.Size();
263 		m.lettersCount = fsm.Letters().Size();
264 
265 		m_vec.resize(m.statesCount * m.lettersCount);
266 		if (need_actions)
267 			m_actionsvec.resize(m.statesCount * m.lettersCount);
268 		m_vecptr = &m_vec;
269 		alloc(m_letters, MaxChar);
270 		m_jumps = 0;
271 		m_actions = 0;
272 		m_jumpPos = 0;
273 		alloc(m_finals, m.statesCount);
274 
275 		// Build letter translation table
276 		Fill(m_letters, m_letters + MaxChar, 0);
277 		for (auto&& letter : fsm.Letters())
278 			for (auto&& character : letter.second.second)
279 				m_letters[character] = letter.second.first;
280 
281 		m.start = fsm.Initial();
282 		BuildScanner(fsm, *this);
283 	}
284 
285 	SlowScanner& operator = (const SlowScanner& s) { SlowScanner(s).Swap(*this); return *this; }
286 
~SlowScanner()287 	~SlowScanner()
288 	{
289 		for (auto&& i : m_pool)
290 			free(i);
291 	}
292 
293 	void Save(yostream*) const;
294 	void Load(yistream*);
295 
StateIndex(const State & s)296 	const State& StateIndex(const State& s) const { return s; }
297 
298 protected:
IsMmaped()299 	bool IsMmaped() const
300 	{
301 		return (!m_vecptr);
302 	}
303 
GetJump(size_t pos)304 	size_t GetJump(size_t pos) const
305 	{
306 		return m_jumps[pos];
307 	}
308 
GetAction(size_t pos)309 	Action& GetAction(size_t pos) const
310 	{
311 		return m_actions[pos];
312 	}
313 
GetActionsVec(size_t from)314 	const TVector<Action>& GetActionsVec(size_t from) const
315 	{
316 		return m_actionsvec[from];
317 	}
318 
GetJumpsVec(size_t from)319 	const TVector<unsigned int>& GetJumpsVec(size_t from) const
320 	{
321 		return m_vec[from];
322 	}
323 
GetJumpPos()324 	size_t* GetJumpPos() const
325 	{
326 		return m_jumpPos;
327 	}
328 
GetStart()329 	size_t GetStart() const
330 	{
331 		return m.start;
332 	}
333 
IsFinal(size_t pos)334 	bool IsFinal(size_t pos) const
335 	{
336 		return m_finals[pos];
337 	}
338 
339 private:
340 
341 	struct Locals {
342 		size_t statesCount;
343 		size_t lettersCount;
344 		size_t start;
345 	} m;
346 
347 	bool* m_finals;
348 	unsigned* m_jumps;
349 	Action* m_actions;
350 	size_t* m_jumpPos;
351 	size_t* m_letters;
352 
353 	TVector<void*> m_pool;
354 	TVector< TVector<unsigned> > m_vec, *m_vecptr;
355 
356 	// Only used to force Null() call during static initialization, when Null()::n can be
357 	// initialized safely by compilers that don't support thread safe static local vars
358 	// initialization
359 	static const SlowScanner* m_null;
360 
361 	bool need_actions;
362 	TVector<TVector<Action>> m_actionsvec;
363 	static const SlowScanner& Null();
364 
alloc(T * & p,size_t size)365 	template<class T> void alloc(T*& p, size_t size)
366 	{
367 		p = static_cast<T*>(malloc(size * sizeof(T)));
368 		memset(p, 0, size * sizeof(T));
369 		m_pool.push_back(p);
370 	}
371 
Alias(const SlowScanner & s)372 	void Alias(const SlowScanner& s)
373 	{
374 		memcpy(&m, &s.m, sizeof(m));
375 		m_vec.clear();
376 		need_actions = s.need_actions;
377 		m_actionsvec.clear();
378 		m_finals = s.m_finals;
379 		m_jumps = s.m_jumps;
380 		m_actions = s.m_actions;
381 		m_jumpPos = s.m_jumpPos;
382 		m_letters = s.m_letters;
383 		m_vecptr = s.m_vecptr;
384 		m_pool.clear();
385 	}
386 
SetJump(size_t oldState,Char c,size_t newState,unsigned long action)387 	void SetJump(size_t oldState, Char c, size_t newState, unsigned long action)
388 	{
389 		Y_ASSERT(!m_vec.empty());
390 		Y_ASSERT(oldState < m.statesCount);
391 		Y_ASSERT(newState < m.statesCount);
392 
393 		size_t idx = oldState * m.lettersCount + m_letters[c];
394 		m_vec[idx].push_back(newState);
395 		if (need_actions)
396 			m_actionsvec[idx].push_back(action);
397 	}
398 
RemapAction(unsigned long action)399 	unsigned long RemapAction(unsigned long action) { return action; }
400 
SetInitial(size_t state)401 	void SetInitial(size_t state) { m.start = state; }
SetTag(size_t state,ui8 tag)402 	void SetTag(size_t state, ui8 tag) { m_finals[state] = (tag != 0); }
403 
FinishBuild()404 	void FinishBuild() {}
405 
Accept()406 	static ypair<const size_t*, const size_t*> Accept()
407 	{
408 		static size_t v[1] = { 0 };
409 
410 		return ymake_pair(v, v + 1);
411 	}
412 
Deny()413 	static ypair<const size_t*, const size_t*> Deny()
414 	{
415 		static size_t v[1] = { 0 };
416 		return ymake_pair(v, v);
417 	}
418 
419 	friend void BuildScanner<SlowScanner>(const Fsm&, SlowScanner&);
420 };
421 
422 template<>
Compile(size_t distance)423 inline SlowScanner Fsm::Compile(size_t distance) {
424 	return SlowScanner(*this, false, true, distance);
425 }
426 
Null()427 inline const SlowScanner& SlowScanner::Null()
428 {
429 	static const SlowScanner n = Fsm::MakeFalse().Compile<SlowScanner>();
430 	return n;
431 }
432 
433 #ifndef PIRE_DEBUG
434 /// A specialization of Run(), since its state is much heavier than other ones
435 /// and we thus want to avoid copying states.
436 template<>
437 inline void Run<SlowScanner>(const SlowScanner& scanner, SlowScanner::State& state, const char* begin, const char* end)
438 {
439 	SlowScanner::State temp;
440 	scanner.Initialize(temp);
441 
442 	SlowScanner::State* src = &state;
443 	SlowScanner::State* dest = &temp;
444 
445 	for (; begin != end; ++begin) {
446 		scanner.Next(*src, *dest, static_cast<unsigned char>(*begin));
447 		DoSwap(src, dest);
448 	}
449 	if (src != &state)
450 		state = *src;
451 }
452 #endif
453 
454 }
455 
456 
457 #endif
458