1 /*
2 * slow.h -- definition of the SlowScanner
3 *
4 * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
5 * Alexander Gololobov <agololobov@gmail.com>
6 *
7 * This file is part of Pire, the Perl Incompatible
8 * Regular Expressions library.
9 *
10 * Pire is free software: you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser Public License as published by
12 * the Free Software Foundation, either version 3 of the License, or
13 * (at your option) any later version.
14 *
15 * Pire is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser Public License for more details.
19 * You should have received a copy of the GNU Lesser Public License
20 * along with Pire. If not, see <http://www.gnu.org/licenses>.
21 */
22
23
24 #ifndef PIRE_SCANNERS_SLOW_H
25 #define PIRE_SCANNERS_SLOW_H
26
27 #include "common.h"
28 #include "../approx_matching.h"
29 #include "../stub/stl.h"
30 #include "../partition.h"
31 #include "../vbitset.h"
32 #include "../fsm.h"
33 #include "../run.h"
34 #include "../stub/saveload.h"
35
36 #ifdef PIRE_DEBUG
37 #include <iostream>
38 #include "../stub/lexical_cast.h"
39 #endif
40
41 namespace Pire {
42
43 /**
44 * A 'slow' scanner.
45 * Takes O( str.length() * this->m_states.size() ) time to scan string,
46 * but does not require FSM to be deterministic.
47 * Thus can be used to handle something sorta /x.{40}$/,
48 * where deterministic FSM contains 2^40 states and hence cannot fit
49 * in memory.
50 */
51 class SlowScanner {
52 public:
53 typedef size_t Transition;
54 typedef ui16 Letter;
55 typedef ui32 Action;
56 typedef ui8 Tag;
57
58 enum {
59 FinalFlag = 1,
60 DeadFlag = 0
61 };
62
63 struct State {
64 TVector<unsigned> states;
65 BitSet flags;
66
StateState67 State() {}
StateState68 State(size_t size): flags(size) { states.reserve(size); }
SwapState69 void Swap(State& s) { states.swap(s.states); flags.Swap(s.flags); }
70
71 #ifdef PIRE_DEBUG
72 friend yostream& operator << (yostream& stream, const State& state) { return stream << Join(state.states.begin(), state.states.end(), ", "); }
73 #endif
74 };
75
76 SlowScanner(bool needActions = false) {
77 Alias(Null());
78 need_actions = needActions;
79 }
80
GetLettersCount()81 size_t GetLettersCount() const {return m.lettersCount; };
82
Size()83 size_t Size() const { return GetSize(); }
GetSize()84 size_t GetSize() const { return m.statesCount; }
Empty()85 bool Empty() const { return m_finals == Null().m_finals; }
86
Id()87 size_t Id() const {return (size_t) -1;}
RegexpsCount()88 size_t RegexpsCount() const { return Empty() ? 0 : 1; }
89
Initialize(State & state)90 void Initialize(State& state) const
91 {
92 state.states.clear();
93 state.states.reserve(m.statesCount);
94 state.states.push_back(m.start);
95 BitSet(m.statesCount).Swap(state.flags);
96 }
97
Translate(Char ch)98 Char Translate(Char ch) const
99 {
100 return m_letters[static_cast<size_t>(ch)];
101 }
102
NextTranslated(const State & current,State & next,Char l)103 Action NextTranslated(const State& current, State& next, Char l) const
104 {
105 next.flags.Clear();
106 next.states.clear();
107 for (auto&& state : current.states) {
108 const unsigned* begin = 0;
109 const unsigned* end = 0;
110 if (!m_vecptr) {
111 const size_t* pos = m_jumpPos + state * m.lettersCount + l;
112 begin = m_jumps + pos[0];
113 end = m_jumps + pos[1];
114 } else {
115 const auto& v = (*m_vecptr)[state * m.lettersCount + l];
116 if (!v.empty()) {
117 begin = &v[0];
118 end = &v[0] + v.size();
119 }
120 }
121
122 for (; begin != end; ++begin)
123 if (!next.flags.Test(*begin)) {
124 next.flags.Set(*begin);
125 next.states.push_back(*begin);
126 }
127 }
128
129 return 0;
130 }
131
Next(const State & current,State & next,Char c)132 Action Next(const State& current, State& next, Char c) const
133 {
134 return NextTranslated(current, next, Translate(c));
135 }
136
TakeAction(State &,Action)137 bool TakeAction(State&, Action) const { return false; }
138
NextTranslated(State & s,Char l)139 Action NextTranslated(State& s, Char l) const
140 {
141 State dest(m.statesCount);
142 Action a = NextTranslated(s, dest, l);
143 s.Swap(dest);
144 return a;
145 }
146
Next(State & s,Char c)147 Action Next(State& s, Char c) const
148 {
149 return NextTranslated(s, Translate(c));
150 }
151
Final(const State & s)152 bool Final(const State& s) const
153 {
154 for (auto&& state : s.states)
155 if (m_finals[state])
156 return true;
157 return false;
158 }
159
Dead(const State &)160 bool Dead(const State&) const
161 {
162 return false;
163 }
164
AcceptedRegexps(const State & s)165 ypair<const size_t*, const size_t*> AcceptedRegexps(const State& s) const {
166 return Final(s) ? Accept() : Deny();
167 }
168
CanStop(const State & s)169 bool CanStop(const State& s) const {
170 return Final(s);
171 }
172
Mmap(const void * ptr,size_t size)173 const void* Mmap(const void* ptr, size_t size)
174 {
175 Impl::CheckAlign(ptr);
176 SlowScanner s;
177 const size_t* p = reinterpret_cast<const size_t*>(ptr);
178
179 Impl::ValidateHeader(p, size, ScannerIOTypes::SlowScanner, sizeof(s.m));
180 Locals* locals;
181 Impl::MapPtr(locals, 1, p, size);
182 memcpy(&s.m, locals, sizeof(s.m));
183
184 bool empty = *((const bool*) p);
185 Impl::AdvancePtr(p, size, sizeof(empty));
186 Impl::AlignPtr(p, size);
187
188 if (empty)
189 s.Alias(Null());
190 else {
191 s.m_vecptr = 0;
192 Impl::MapPtr(s.m_letters, MaxChar, p, size);
193 Impl::MapPtr(s.m_finals, s.m.statesCount, p, size);
194 Impl::MapPtr(s.m_jumpPos, s.m.statesCount * s.m.lettersCount + 1, p, size);
195 Impl::MapPtr(s.m_jumps, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size);
196 if (need_actions)
197 Impl::MapPtr(s.m_actions, s.m_jumpPos[s.m.statesCount * s.m.lettersCount], p, size);
198 Swap(s);
199 }
200 return (const void*) p;
201 }
202
Swap(SlowScanner & s)203 void Swap(SlowScanner& s)
204 {
205 DoSwap(m_finals, s.m_finals);
206 DoSwap(m_jumps, s.m_jumps);
207 DoSwap(m_actions, s.m_actions);
208 DoSwap(m_jumpPos, s.m_jumpPos);
209 DoSwap(m.statesCount, s.m.statesCount);
210 DoSwap(m.lettersCount, s.m.lettersCount);
211 DoSwap(m.start, s.m.start);
212 DoSwap(m_letters, s.m_letters);
213 DoSwap(m_pool, s.m_pool);
214 DoSwap(m_vec, s.m_vec);
215
216 DoSwap(m_vecptr, s.m_vecptr);
217 DoSwap(need_actions, s.need_actions);
218 DoSwap(m_actionsvec, s.m_actionsvec);
219 if (m_vecptr == &s.m_vec)
220 m_vecptr = &m_vec;
221 if (s.m_vecptr == &m_vec)
222 s.m_vecptr = &s.m_vec;
223 }
224
SlowScanner(const SlowScanner & s)225 SlowScanner(const SlowScanner& s)
226 : m(s.m)
227 , m_vec(s.m_vec)
228 , need_actions(s.need_actions)
229 , m_actionsvec(s.m_actionsvec)
230 {
231 if (s.m_vec.empty()) {
232 // Empty or mmap()-ed scanner, just copy pointers
233 m_finals = s.m_finals;
234 m_jumps = s.m_jumps;
235 m_actions = s.m_actions;
236 m_jumpPos = s.m_jumpPos;
237 m_letters = s.m_letters;
238 m_vecptr = 0;
239 } else {
240 // In-memory scanner, perform deep copy
241 alloc(m_letters, MaxChar);
242 memcpy(m_letters, s.m_letters, sizeof(*m_letters) * MaxChar);
243 m_jumps = 0;
244 m_jumpPos = 0;
245 m_actions = 0;
246 alloc(m_finals, m.statesCount);
247 memcpy(m_finals, s.m_finals, sizeof(*m_finals) * m.statesCount);
248 m_vecptr = &m_vec;
249 }
250 }
251
252 explicit SlowScanner(Fsm& fsm, bool needActions = false, bool removeEpsilons = true, size_t distance = 0)
need_actions(needActions)253 : need_actions(needActions)
254 {
255 if (distance) {
256 fsm = CreateApproxFsm(fsm, distance);
257 }
258 if (removeEpsilons)
259 fsm.RemoveEpsilons();
260 fsm.Sparse(!removeEpsilons);
261
262 m.statesCount = fsm.Size();
263 m.lettersCount = fsm.Letters().Size();
264
265 m_vec.resize(m.statesCount * m.lettersCount);
266 if (need_actions)
267 m_actionsvec.resize(m.statesCount * m.lettersCount);
268 m_vecptr = &m_vec;
269 alloc(m_letters, MaxChar);
270 m_jumps = 0;
271 m_actions = 0;
272 m_jumpPos = 0;
273 alloc(m_finals, m.statesCount);
274
275 // Build letter translation table
276 Fill(m_letters, m_letters + MaxChar, 0);
277 for (auto&& letter : fsm.Letters())
278 for (auto&& character : letter.second.second)
279 m_letters[character] = letter.second.first;
280
281 m.start = fsm.Initial();
282 BuildScanner(fsm, *this);
283 }
284
285 SlowScanner& operator = (const SlowScanner& s) { SlowScanner(s).Swap(*this); return *this; }
286
~SlowScanner()287 ~SlowScanner()
288 {
289 for (auto&& i : m_pool)
290 free(i);
291 }
292
293 void Save(yostream*) const;
294 void Load(yistream*);
295
StateIndex(const State & s)296 const State& StateIndex(const State& s) const { return s; }
297
298 protected:
IsMmaped()299 bool IsMmaped() const
300 {
301 return (!m_vecptr);
302 }
303
GetJump(size_t pos)304 size_t GetJump(size_t pos) const
305 {
306 return m_jumps[pos];
307 }
308
GetAction(size_t pos)309 Action& GetAction(size_t pos) const
310 {
311 return m_actions[pos];
312 }
313
GetActionsVec(size_t from)314 const TVector<Action>& GetActionsVec(size_t from) const
315 {
316 return m_actionsvec[from];
317 }
318
GetJumpsVec(size_t from)319 const TVector<unsigned int>& GetJumpsVec(size_t from) const
320 {
321 return m_vec[from];
322 }
323
GetJumpPos()324 size_t* GetJumpPos() const
325 {
326 return m_jumpPos;
327 }
328
GetStart()329 size_t GetStart() const
330 {
331 return m.start;
332 }
333
IsFinal(size_t pos)334 bool IsFinal(size_t pos) const
335 {
336 return m_finals[pos];
337 }
338
339 private:
340
341 struct Locals {
342 size_t statesCount;
343 size_t lettersCount;
344 size_t start;
345 } m;
346
347 bool* m_finals;
348 unsigned* m_jumps;
349 Action* m_actions;
350 size_t* m_jumpPos;
351 size_t* m_letters;
352
353 TVector<void*> m_pool;
354 TVector< TVector<unsigned> > m_vec, *m_vecptr;
355
356 // Only used to force Null() call during static initialization, when Null()::n can be
357 // initialized safely by compilers that don't support thread safe static local vars
358 // initialization
359 static const SlowScanner* m_null;
360
361 bool need_actions;
362 TVector<TVector<Action>> m_actionsvec;
363 static const SlowScanner& Null();
364
alloc(T * & p,size_t size)365 template<class T> void alloc(T*& p, size_t size)
366 {
367 p = static_cast<T*>(malloc(size * sizeof(T)));
368 memset(p, 0, size * sizeof(T));
369 m_pool.push_back(p);
370 }
371
Alias(const SlowScanner & s)372 void Alias(const SlowScanner& s)
373 {
374 memcpy(&m, &s.m, sizeof(m));
375 m_vec.clear();
376 need_actions = s.need_actions;
377 m_actionsvec.clear();
378 m_finals = s.m_finals;
379 m_jumps = s.m_jumps;
380 m_actions = s.m_actions;
381 m_jumpPos = s.m_jumpPos;
382 m_letters = s.m_letters;
383 m_vecptr = s.m_vecptr;
384 m_pool.clear();
385 }
386
SetJump(size_t oldState,Char c,size_t newState,unsigned long action)387 void SetJump(size_t oldState, Char c, size_t newState, unsigned long action)
388 {
389 Y_ASSERT(!m_vec.empty());
390 Y_ASSERT(oldState < m.statesCount);
391 Y_ASSERT(newState < m.statesCount);
392
393 size_t idx = oldState * m.lettersCount + m_letters[c];
394 m_vec[idx].push_back(newState);
395 if (need_actions)
396 m_actionsvec[idx].push_back(action);
397 }
398
RemapAction(unsigned long action)399 unsigned long RemapAction(unsigned long action) { return action; }
400
SetInitial(size_t state)401 void SetInitial(size_t state) { m.start = state; }
SetTag(size_t state,ui8 tag)402 void SetTag(size_t state, ui8 tag) { m_finals[state] = (tag != 0); }
403
FinishBuild()404 void FinishBuild() {}
405
Accept()406 static ypair<const size_t*, const size_t*> Accept()
407 {
408 static size_t v[1] = { 0 };
409
410 return ymake_pair(v, v + 1);
411 }
412
Deny()413 static ypair<const size_t*, const size_t*> Deny()
414 {
415 static size_t v[1] = { 0 };
416 return ymake_pair(v, v);
417 }
418
419 friend void BuildScanner<SlowScanner>(const Fsm&, SlowScanner&);
420 };
421
422 template<>
Compile(size_t distance)423 inline SlowScanner Fsm::Compile(size_t distance) {
424 return SlowScanner(*this, false, true, distance);
425 }
426
Null()427 inline const SlowScanner& SlowScanner::Null()
428 {
429 static const SlowScanner n = Fsm::MakeFalse().Compile<SlowScanner>();
430 return n;
431 }
432
433 #ifndef PIRE_DEBUG
434 /// A specialization of Run(), since its state is much heavier than other ones
435 /// and we thus want to avoid copying states.
436 template<>
437 inline void Run<SlowScanner>(const SlowScanner& scanner, SlowScanner::State& state, const char* begin, const char* end)
438 {
439 SlowScanner::State temp;
440 scanner.Initialize(temp);
441
442 SlowScanner::State* src = &state;
443 SlowScanner::State* dest = &temp;
444
445 for (; begin != end; ++begin) {
446 scanner.Next(*src, *dest, static_cast<unsigned char>(*begin));
447 DoSwap(src, dest);
448 }
449 if (src != &state)
450 state = *src;
451 }
452 #endif
453
454 }
455
456
457 #endif
458