1 /*
2  * loaded.h -- a definition of the LoadedScanner
3  *
4  * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
5  *                          Alexander Gololobov <agololobov@gmail.com>
6  *
7  * This file is part of Pire, the Perl Incompatible
8  * Regular Expressions library.
9  *
10  * Pire is free software: you can redistribute it and/or modify
11  * it under the terms of the GNU Lesser Public License as published by
12  * the Free Software Foundation, either version 3 of the License, or
13  * (at your option) any later version.
14  *
15  * Pire is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser Public License for more details.
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Pire.  If not, see <http://www.gnu.org/licenses>.
21  */
22 
23 
24 #ifndef PIRE_SCANNERS_LOADED_H
25 #define PIRE_SCANNERS_LOADED_H
26 
27 #include <string.h>
28 #include "common.h"
29 #include "../approx_matching.h"
30 #include "../fsm.h"
31 #include "../partition.h"
32 
33 #ifdef PIRE_DEBUG
34 #include <iostream>
35 #endif
36 
37 namespace Pire {
38 
39 /**
40 * A loaded scanner -- the deterministic scanner having actions
41 *                     associated with states and transitions
42 *
43 * Not a complete scanner itself (hence abstract), this class provides
44 * infrastructure for regexp-based algorithms (e.g. counts or captures),
45 * supporting major part of scanner construction, (de)serialization,
46 * mmap()-ing, etc.
47 *
48 * It is a good idea to override copy ctor, operator= and swap()
49 * in subclasses to avoid mixing different scanner types in these methods.
50 * Also please note that subclasses should not have any data members of thier own.
51 */
52 class LoadedScanner {
53 public:
54 	typedef ui8         Letter;
55 	typedef ui32        Action;
56 	typedef ui8         Tag;
57 
58 	typedef size_t InternalState;
59 
60 	union Transition {
61 		size_t raw;			// alignment hint for compiler
62 		struct {
63 			ui32 shift;
64 			Action action;
65 		};
66 	};
67 
68 	// Override in subclass, if neccessary
69 	enum {
70 		FinalFlag = 0,
71 		DeadFlag  = 0
72 	};
73 
74 	static const size_t MAX_RE_COUNT = 16;
75 
76 protected:
LoadedScanner()77 	LoadedScanner() { Alias(Null()); }
78 
LoadedScanner(const LoadedScanner & s)79 	LoadedScanner(const LoadedScanner& s): m(s.m)
80 	{
81 		if (s.m_buffer) {
82 			m_buffer = BufferType(new char [BufSize()]);
83 			memcpy(m_buffer.get(), s.m_buffer.get(), BufSize());
84 			Markup(m_buffer.get());
85 			m.initial = (InternalState)m_jumps + (s.m.initial - (InternalState)s.m_jumps);
86 		} else {
87 			Alias(s);
88 		}
89 	}
90 
Swap(LoadedScanner & s)91 	void Swap(LoadedScanner& s)
92 	{
93 		DoSwap(m_buffer, s.m_buffer);
94 		DoSwap(m.statesCount, s.m.statesCount);
95 		DoSwap(m.lettersCount, s.m.lettersCount);
96 		DoSwap(m.regexpsCount, s.m.regexpsCount);
97 		DoSwap(m.initial, s.m.initial);
98 		DoSwap(m_letters, s.m_letters);
99 		DoSwap(m_jumps, s.m_jumps);
100 		DoSwap(m_tags, s.m_tags);
101 	}
102 
103 	LoadedScanner& operator = (const LoadedScanner& s) { LoadedScanner(s).Swap(*this); return *this; }
LoadedScanner(LoadedScanner && other)104 	LoadedScanner (LoadedScanner&& other) : LoadedScanner() {
105 		Swap(other);
106 	}
107 	LoadedScanner& operator=(LoadedScanner&& other) {
108 		Swap(other);
109 		return *this;
110 	}
111 
112 public:
Size()113 	size_t Size() const { return m.statesCount; }
114 
Empty()115 	bool Empty() const { return m_jumps == Null().m_jumps; }
116 
RegexpsCount()117 	size_t RegexpsCount() const { return Empty() ? 0 : m.regexpsCount; }
118 
LettersCount()119 	size_t LettersCount() const { return m.lettersCount; }
120 
Mmap(const void * ptr,size_t size)121 	const void* Mmap(const void* ptr, size_t size) {
122 		return Mmap(ptr, size, nullptr);
123 	}
124 
Mmap(const void * ptr,size_t size,ui32 * type)125 	const void* Mmap(const void* ptr, size_t size, ui32* type)
126 	{
127 		Impl::CheckAlign(ptr);
128 		LoadedScanner s;
129 		const size_t* p = reinterpret_cast<const size_t*>(ptr);
130 		Header header = Impl::ValidateHeader(p, size, ScannerIOTypes::LoadedScanner, sizeof(s.m));
131 		if (type) {
132 			*type = header.Type;
133 		}
134 
135 		Locals* locals;
136 		Impl::MapPtr(locals, 1, p, size);
137 		memcpy(&s.m, locals, sizeof(s.m));
138 
139 		Impl::MapPtr(s.m_letters, MaxChar, p, size);
140 		Impl::MapPtr(s.m_jumps, s.m.statesCount * s.m.lettersCount, p, size);
141 		if (header.Version == Header::RE_VERSION_WITH_MACTIONS) {
142 			Action* actions = 0;
143 			Impl::MapPtr(actions, s.m.statesCount * s.m.lettersCount, p, size);
144 		}
145 		Impl::MapPtr(s.m_tags, s.m.statesCount, p, size);
146 
147 		s.m.initial += reinterpret_cast<size_t>(s.m_jumps);
148 		Swap(s);
149 
150 		return (const void*) p;
151 	}
152 
153 	void Save(yostream*, ui32 type) const;
154 	void Save(yostream*) const;
155 	void Load(yistream*, ui32* type);
156 	void Load(yistream*);
157 
158 		template<class Eq>
159 	void Init(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1)
160 	{
161 		m.statesCount = states;
162 		m.lettersCount = letters.Size();
163 		m.regexpsCount = regexpsCount;
164 		m_buffer = BufferType(new char[BufSize()]);
165 		memset(m_buffer.get(), 0, BufSize());
166 		Markup(m_buffer.get());
167 
168 		m.initial = reinterpret_cast<size_t>(m_jumps + startState * m.lettersCount);
169 
170 		// Build letter translation table
171 		Fill(m_letters, m_letters + sizeof(m_letters)/sizeof(*m_letters), 0);
172 		for (auto&& letter : letters)
173 			for (auto&& character : letter.second.second)
174 				m_letters[character] = letter.second.first;
175 	}
176 
StateSize()177 	size_t StateSize() const
178 	{
179 		return m.lettersCount * sizeof(*m_jumps);
180 	}
181 
TransitionIndex(size_t state,Char c)182 	size_t TransitionIndex(size_t state, Char c) const
183 	{
184 		return state * m.lettersCount + m_letters[c];
185 	}
186 
SetJump(size_t oldState,Char c,size_t newState,Action action)187 	void SetJump(size_t oldState, Char c, size_t newState, Action action)
188 	{
189 		Y_ASSERT(m_buffer);
190 		Y_ASSERT(oldState < m.statesCount);
191 		Y_ASSERT(newState < m.statesCount);
192 
193 		size_t shift = (newState - oldState) * StateSize();
194 		Transition tr;
195 		tr.shift = shift;
196 		tr.action = action;
197 		m_jumps[TransitionIndex(oldState, c)] = tr;
198 	}
199 
RemapAction(Action action)200 	Action RemapAction(Action action) { return action; }
201 
SetInitial(size_t state)202 	void SetInitial(size_t state) { Y_ASSERT(m_buffer); m.initial = reinterpret_cast<size_t>(m_jumps + state * m.lettersCount); }
SetTag(size_t state,Tag tag)203 	void SetTag(size_t state, Tag tag) { Y_ASSERT(m_buffer); m_tags[state] = tag; }
FinishBuild()204 	void FinishBuild() {}
205 
StateIdx(InternalState s)206 	size_t StateIdx(InternalState s) const
207 	{
208 		return (reinterpret_cast<Transition*>(s) - m_jumps) / m.lettersCount;
209 	}
210 
SignExtend(i32 i)211 	i64 SignExtend(i32 i) const { return i; }
212 
BufSize()213 	size_t BufSize() const
214 	{
215 		return
216 			MaxChar * sizeof(*m_letters)
217 			+ m.statesCount * StateSize()
218 			+ m.statesCount * sizeof(*m_tags)
219 			;
220 	}
221 
222 protected:
223 
224 	static const Action IncrementMask     = (1 << MAX_RE_COUNT) - 1;
225 	static const Action ResetMask         = IncrementMask << MAX_RE_COUNT;
226 
227 	// TODO: maybe, put fields in private section and provide data accessors
228 
229 	struct Locals {
230 		ui32 statesCount;
231 		ui32 lettersCount;
232 		ui32 regexpsCount;
233 		size_t initial;
234 	} m;
235 
236 	using BufferType = std::unique_ptr<char[]>;
237 	BufferType m_buffer;
238 
239 	Letter* m_letters;
240 	Transition* m_jumps;
241 	Tag* m_tags;
242 
243 	virtual ~LoadedScanner();
244 
245 private:
246 	explicit LoadedScanner(Fsm& fsm, size_t distance = 0)
247 	{
248 		if (distance) {
249 			fsm = CreateApproxFsm(fsm, distance);
250 		}
251 		fsm.Canonize();
252 		Init(fsm.Size(), fsm.Letters(), fsm.Initial());
253 		BuildScanner(fsm, *this);
254 	}
255 
256 	// Only used to force Null() call during static initialization, when Null()::n can be
257 	// initialized safely by compilers that don't support thread safe static local vars
258 	// initialization
259 	static const LoadedScanner* m_null;
260 
Null()261 	inline static const LoadedScanner& Null()
262 	{
263 		static const LoadedScanner n = Fsm::MakeFalse().Compile<LoadedScanner>();
264 		return n;
265 	}
266 
Markup(void * buf)267 	void Markup(void* buf)
268 	{
269 		m_letters = reinterpret_cast<Letter*>(buf);
270 		m_jumps   = reinterpret_cast<Transition*>(m_letters + MaxChar);
271 		m_tags    = reinterpret_cast<Tag*>(m_jumps + m.statesCount * m.lettersCount);
272 	}
273 
Alias(const LoadedScanner & s)274 	void Alias(const LoadedScanner& s)
275 	{
276 		memcpy(&m, &s.m, sizeof(m));
277 		m_buffer = 0;
278 		m_letters = s.m_letters;
279 		m_jumps = s.m_jumps;
280 		m_tags = s.m_tags;
281 	}
282 
283 	template<class Eq>
284 	LoadedScanner(size_t states, const Partition<Char, Eq>& letters, size_t startState, size_t regexpsCount = 1)
285 	{
286 		Init(states, letters, startState, regexpsCount);
287 	}
288 
289 	friend class Fsm;
290 };
291 
292 inline LoadedScanner::~LoadedScanner() = default;
293 
294 }
295 
296 
297 #endif
298