1 /*
2  *  digester.cpp
3  *
4  *
5  *  Created by Andreas Vox on 02.06.06.
6  *  Copyright 2006 under GPL2. All rights reserved.
7  *
8  */
9 
10 #include <iostream>
11 #include <vector>
12 #include <map>
13 #include <utility>
14 #include <deque>
15 #include <functional>
16 #include <set>
17 
18 #include "digester.h"
19 #include "automata.h"
20 #include "actions.h"
21 
22 namespace desaxe {
23 
24 typedef std::pair<Xml_string, Action> rule_t;
25 
26 typedef unsigned short token_t;
27 typedef std::vector<token_t> path_t;
28 enum special_token { EMPTY = 0, START = 1, ANY = 2, REPEAT = 3 } ;
29 
30 typedef unsigned short nfa_state_t;
31 
32 struct DFA_State
33 {
34 	unsigned int ID;
35 	std::vector<rule_t> rules;
36 
DFA_Statedesaxe::DFA_State37 	DFA_State() : ID(0) {}
38 };
39 
40 typedef DFA_State* dfa_state_t;
41 
42 
43 /**
44  *   private implementation for Digester.
45  *   uses a DFA to keep track of the currently recognized pattern(s).
46  *   if you have no idea what a (non)deterministic finite automaton is you
47  *   probably don't want to read the rest of this code.
48  */
49 
50 class RuleState
51 {
52 public:
53 	RuleState();
54 	RuleState(const RuleState& other);
55 	~RuleState();
56 	/// adds a rule to the list
57 	void addRule(const Xml_string& pattern, Action action);
58 	/// sets the pattern recognizing automaton to its start state
59 	void reset();
60 	/// returns the rules which apply for the currently recognized pattern
61 	const std::vector<rule_t>& rulesForCurrentState();
62 	/// enters a new state when \c tag is read
63 	void open(const Xml_string& tag);
64 	/// returns to the old state when the close tag is read
65 	void close();
66 	/// diagnostics
67 	void dump();
68 private:
69 	/// user sets these rules
70 	std::vector<rule_t> rules;
71 
72 	/// rule patterns get broken into tokens:
73 	std::map<Xml_string, token_t> tokens;
74 	/// lists the accepting (NFA) states for each rule (same sequence as rules)
75 	std::vector<nfa_state_t> accepting;
76 	/// the dfa corresponding to the rule patterns
77 	automata::DFA<dfa_state_t, token_t> *dfa;
78 	/// stack of dfa states
79 	std::vector<dfa_state_t> stateStack;
80 
81 	/// assigns a unique token to each string
82 	token_t createToken(const Xml_string& tok);
83 	void makeTokens();
84 
85 	/// creates a nondeterministic automaton as base for the DFA
86 	automata::NFA<nfa_state_t, token_t>* createNFA();
87 
88 	/// uses rules and createNFA() to create the DFA and assigns rules to states
89 	void compileDFA();
90 	/// is true after compileDFA has run
91 	bool valid;
92 };
93 
94 
Digester()95 Digester::Digester()
96 {
97 	m_state = new RuleState();
98 	m_result_.ptr = nullptr;
99 	m_result_.type = "";
100 }
101 
102 
~Digester()103 Digester::~Digester() {
104 	delete m_state;
105 	deletePatches(m_patches);
106 }
107 
108 
operator =(const Digester & other)109 Digester& Digester::operator=(const Digester& other)
110 {
111 	if (&other == this)
112 		return *this;
113 	delete m_state;
114 	m_state = new RuleState(*other.m_state);
115 	m_objects = other.m_objects;
116 	m_storage = other.m_storage;
117 	m_result_ = other.m_result_;
118 	m_errors = other.m_errors;
119 	return *this;
120 }
121 
122 
nrOfErrors() const123 int Digester::nrOfErrors() const
124 {
125 	return m_errors.size();
126 }
127 
getError(int i) const128 const Xml_string& Digester::getError(int i) const
129 {
130 	return m_errors[i];
131 }
132 
133 
addRule(const Xml_string & pattern,Action action)134 void Digester::addRule(const Xml_string& pattern, Action action)
135 {
136 	action.setDigester(this);
137 	m_state->addRule(pattern, action);
138 }
139 
reset()140 void Digester::reset()
141 {
142 	m_objects.clear();
143 	m_storage.clear();
144 	m_result_.ptr = nullptr;
145 	m_result_.type = "";
146 	m_errors.clear();
147 }
148 
beginDoc()149 void Digester::beginDoc()
150 {
151 	m_state->reset();
152 #ifdef DESAXE_DEBUG
153 	state->dump();
154 #endif
155 }
156 
endDoc()157 void Digester::endDoc()
158 {
159 }
160 
begin(const Xml_string & tag,Xml_attr attr)161 void Digester::begin(const Xml_string& tag, Xml_attr attr)
162 {
163 	m_state->open(tag);
164 	const std::vector<rule_t>& rules (m_state->rulesForCurrentState());
165 	std::vector<rule_t>::const_iterator it;
166 	for(it=rules.begin(); it!=rules.end(); ++it)
167 	{
168 #ifdef DESAXE_DEBUG
169 		std::cerr << "B " << it->first.toStdString() << " " << typeid(it->second).name() << "\n";
170 #endif
171 		const_cast<Action&>(it->second).begin(tag, attr);
172 	}
173 }
174 
end(const Xml_string & tag)175 void Digester::end(const Xml_string& tag)
176 {
177 	const std::vector<rule_t>& rules (m_state->rulesForCurrentState());
178 	std::vector<rule_t>::const_reverse_iterator it;
179 	for(it=rules.rbegin(); it!=rules.rend(); ++it)
180 	{
181 #ifdef DESAXE_DEBUG
182 		std::cerr << "E " << it->first.toStdString() << " " << typeid(it->second).name() << "\n";
183 #endif
184 		const_cast<Action&>(it->second).end(tag);
185 	}
186 	m_state->close();
187 }
188 
chars(const Xml_string & text)189 void Digester::chars(const Xml_string& text)
190 {
191 	const std::vector<rule_t>& rules (m_state->rulesForCurrentState());
192 	std::vector<rule_t>::const_iterator it;
193 	for(it=rules.begin(); it!=rules.end(); ++it)
194 	{
195 #ifdef DESAXE_DEBUG
196 		std::cerr << "C " << it->first.toStdString() << " " << typeid(it->second).name() << "\n";
197 #endif
198 		const_cast<Action&>(it->second).chars(text);
199 	}
200 }
201 
202 
concat(const Xml_string & pattern1,const Xml_string & pattern2)203 Xml_string Digester::concat(const Xml_string& pattern1, const Xml_string& pattern2)
204 {
205 	if (pattern1.isEmpty())
206 		return pattern2;
207 	if (pattern2.isEmpty())
208 		return pattern1;
209 	if ( (pattern1[pattern1.length()-1] != '/') && (pattern2[0] != '/') )
210 		// insert "/" as separator
211 		return pattern1 + "/" + pattern2;
212 	if ( (pattern1[pattern1.length()-1]=='/') || (pattern2[0]=='/') )
213 		return pattern1 + pattern2;
214 	// cut off one "/"
215 	return pattern1 + QString::fromStdString(std::string(static_cast<const std::string&>(pattern2.toStdString()), 1, std::string::npos));
216 }
217 
218 
RuleState()219 RuleState::RuleState() : dfa(nullptr), valid(false)
220 {}
221 
RuleState(const RuleState & other)222 RuleState::RuleState(const RuleState& other) : rules(other.rules), dfa(nullptr), valid(false)
223 {}
224 
~RuleState()225 RuleState::~RuleState()
226 {
227 	if (dfa)
228 	{
229 		std::set<dfa_state_t> morituri(dfa->states());
230 		for (auto i = morituri.begin(); i != morituri.end(); ++i)
231 			delete *i;
232 		delete dfa;
233 	}
234 }
235 
addRule(const Xml_string & pattern,Action action)236 void RuleState::addRule(const Xml_string& pattern, Action action)
237 {
238 	rules.push_back(std::pair<Xml_string, Action>(pattern,action));
239 	valid = false;
240 }
241 
242 
243 inline
reset()244 void RuleState::reset()
245 {
246 	std::vector<rule_t>::iterator it;
247 	for (it = rules.begin(); it != rules.end(); ++it)
248 		it->second.reset();
249 	stateStack.clear();
250 	if (!valid)
251 		compileDFA();
252 	stateStack.push_back(dfa->start());
253 }
254 
dump()255 void RuleState::dump()
256 {
257 	std::cout << "Rules:\n";
258 	for (unsigned int r=0; r<rules.size(); ++r) {
259 		std::cout << r << ":\t\"" << rules[r].first.toStdString() << "\" accepted in  (" << accepting[r] << ")\n";
260 	}
261 	std::cout << "\nTokens:\n";
262 	for (std::map<Xml_string, token_t>::iterator it=tokens.begin(); it!=tokens.end(); ++it) {
263 		std::cout << it->first.toStdString() << ":\t--> " << it->second << "\n";
264 	}
265 	std::cout << "\nAutomaton:\n";
266 	const std::set<dfa_state_t>& states(dfa->states());
267 	const std::set<token_t>& inputs(dfa->inputs());
268 	std::cout << "STATE";
269 	for (std::set<token_t>::const_iterator i=inputs.begin(); i != inputs.end(); ++i) {
270 		std::cout << "\t" << *i;
271 	}
272 	std::cout << "\tRULES\n\n";
273 	for (std::set<dfa_state_t>::const_iterator s=states.begin(); s!=states.end(); ++s) {
274 		std::cout << (*s)->ID;
275 		for (std::set<token_t>::const_iterator i=inputs.begin(); i!=inputs.end(); ++i) {
276 			dfa_state_t nstate = dfa->next(*s,*i);
277 			std::cout << "\t";
278 			if (nstate)
279 				std::cout << nstate->ID;
280 			else
281 				std::cout << "--";
282 		}
283 		for (std::vector<rule_t>::iterator rs=(*s)->rules.begin(); rs!=(*s)->rules.end(); ++rs)
284 			std::cout << "\t\"" << rs->first.toStdString() << "\"";
285 		std::cout << "\n";
286 	}
287 }
288 
289 inline
open(const Xml_string & tag)290 void RuleState::open(const Xml_string& tag)
291 {
292 	dfa_state_t nstate = dfa->next(stateStack.back(), tokens[tag]);
293 	assert(nstate != nullptr);
294 #ifdef DESAXE_DEBUG
295 	std::cerr << "to state " << nstate->ID << "\n";
296 #endif
297 	if (nstate->ID == dfa->deflt()->ID) {
298 		nstate = dfa->next(stateStack.back(), ANY);
299 #ifdef DESAXE_DEBUG
300 		std::cerr << "empty, trying any:" << nstate->ID << "\n";
301 #endif
302 	}
303 	stateStack.push_back(nstate);
304 }
305 
306 inline
close()307 void RuleState::close()
308 {
309 	stateStack.pop_back();
310 }
311 
312 
313 inline
rulesForCurrentState()314 const std::vector<rule_t>& RuleState::rulesForCurrentState()
315 {
316 	return stateStack.back()->rules;
317 }
318 
makeTokens()319 void RuleState::makeTokens()
320 {
321 	tokens.clear();
322 	tokens[""] = EMPTY;
323 	tokens["/"] = START;
324 	tokens["*"] = ANY;
325 	tokens["**"] = REPEAT;
326 }
327 
328 
329 
createToken(const Xml_string & tok)330 token_t RuleState::createToken(const Xml_string& tok)
331 {
332 	if (tokens.find(tok) == tokens.end())
333 		tokens[tok] = tokens.size() + 1;
334 	return tokens[tok];
335 }
336 
337 
338 
createNFA()339 automata::NFA<nfa_state_t, token_t>* RuleState::createNFA()
340 {
341 	using automata::NFA;
342 
343 	/// maps paths uniquely to an nfa_state
344 	std::map<path_t, nfa_state_t> nfa_states;
345 	nfa_states.clear();
346 
347 	path_t prefix;
348 
349 	// START and EMPTY are both: tokens and nfa_states
350 	prefix.push_back(START);
351 	nfa_states[prefix] = START;
352 
353 	prefix[0] = EMPTY;
354 	nfa_states[prefix] = EMPTY;
355 
356 	std::set<nfa_state_t> deflt;
357 	deflt.insert(EMPTY);
358 
359 	NFA<nfa_state_t, token_t> *nfa = new NFA<nfa_state_t, token_t>(START, deflt);
360 
361 	nfa->addState(EMPTY);
362 	nfa->addInput(ANY);
363 	nfa->addTransition(EMPTY, ANY, EMPTY);
364 
365 	for (unsigned int i = 0; i < rules.size(); ++i)
366 	{
367 		const std::string currPattern(fromXMLString(rules[i].first));
368 		const unsigned int len = currPattern.length();
369 		int pos;
370 		nfa_state_t lastState;
371 
372 		// determine if this is a start pattern
373 		prefix.resize(1);
374 		if (currPattern[0] == '/') {
375 			prefix[0] = START;
376 			pos = 1;
377 			lastState = START;
378 		}
379 		else {
380 			prefix[0] = EMPTY;
381 			pos = 0;
382 			lastState = EMPTY;
383 		}
384 
385 //		std::cerr << "looking at pattern: " << currPattern << "\n";
386 		// for all prefixes
387 		do {
388 			std::string::size_type pos2 = currPattern.find('/', pos);
389 			if (pos2 == std::string::npos)
390 				pos2 = len;
391 
392 			std::string diff(currPattern.substr(pos, pos2-pos));
393 			token_t tok = createToken(fromSTLString(diff));
394 //			std::cerr << pos << "-" << pos2 << "\t: " << diff << " = " << tok << "\n";
395 
396 			// create loop if REPEAT token
397 			if (tok == REPEAT) {
398 				nfa->addTransition(lastState, ANY, lastState); //FIXME: that's wrong, need to create repeating state
399 //				std::cerr << "T " << lastState << "--*-->" << lastState << "\n";
400 				pos = pos2 + 1;
401 				continue;
402 			}
403 
404 			prefix.push_back(tok);
405 			// create new state if necessary
406 			nfa_state_t nstate;
407 			if (nfa_states.find(prefix) != nfa_states.end()) {
408 				nstate = nfa_states[prefix];
409 			}
410 			else {
411 				nstate = nfa_states.size();
412 				nfa->addState(nstate);
413 				nfa_states[prefix] = nstate;
414 			}
415 			// add transition
416 			nfa->addInput(tok);
417 			nfa->addTransition(lastState, tok, nstate);
418 //			std::cerr << "T " << lastState << "--(" << tok << ")-->" << nstate << "\n";
419 			lastState = nstate;
420 			pos = pos2 + 1;
421 		} while(pos < signed(len));
422 		accepting.push_back(lastState);
423 //		std::cerr << "accepted in " << lastState << "\n";
424 
425 		// copy all transition from EMPTY to all other states
426 		const NFA<nfa_state_t, token_t>::Transitions& transFromEmpty(nfa->transitions(EMPTY));
427 		std::set<nfa_state_t>::const_iterator it, st;
428 		NFA<nfa_state_t, token_t>::Transitions::const_iterator tr;
429 		for (it = nfa->states().begin(); it != nfa->states().end(); ++it) {
430 			if (*it == EMPTY)
431 				continue;
432 			for (tr = transFromEmpty.begin(); tr != transFromEmpty.end(); ++tr)
433 				for (st = tr->second.begin(); st != tr->second.end(); ++st)
434 					nfa->addTransition(*it, tr->first, *st);
435 		}
436 
437 		// ANY transitions
438 		const std::set<token_t>& inputs(nfa->inputs());
439 		std::set<token_t>::const_iterator tok;
440 		for (it = nfa->states().begin(); it != nfa->states().end(); ++it) {
441 			const std::set<nfa_state_t>& anyStates(nfa->next(*it, ANY));
442 			for (st = anyStates.begin(); st != anyStates.end(); ++st)
443 				for (tok=inputs.begin(); tok != inputs.end(); ++tok)
444 					if (*tok != ANY)
445 						nfa->addTransition(*it, *tok, *st);
446 		}
447 	}
448 	return nfa;
449 }
450 
451 struct CreateDFAState : public std::unary_function <std::set<nfa_state_t>, dfa_state_t> {
452 
CreateDFAStatedesaxe::CreateDFAState453 	CreateDFAState(const std::vector<rule_t>& rules, const std::vector<nfa_state_t>& accepting)
454 	: n(0), rules_(rules), accepting_(accepting)
455 	{}
456 
operator ()desaxe::CreateDFAState457 	dfa_state_t operator()(const std::set<nfa_state_t>& combination)
458 	{
459 		dfa_state_t result = new DFA_State();
460 		result->ID = n++;
461 		result->rules.clear();
462 		for (unsigned int i=0; i < rules_.size(); ++i)
463 			if (combination.find(accepting_[i]) != combination.end())
464 				result->rules.push_back(rules_[i]);
465 		return result;
466 	}
467 
468 	unsigned int n;
469 	const std::vector<rule_t>& rules_;
470 	const std::vector<nfa_state_t>& accepting_;
471 };
472 
473 //dfa_state_t create_fun(const std::set<nfa_state_t>& combination);
474 
compileDFA()475 void RuleState::compileDFA()
476 {
477 	// build NFA:
478 	makeTokens();
479 	automata::NFA<nfa_state_t, token_t> *nfa = createNFA();
480 	// make deterministic:
481 	CreateDFAState create(rules, accepting);
482 //	dfa = nfa->constructDFA<dfa_state_t>(create);        // wont work
483 //	dfa = nfa->constructDFA<dfa_state_t>(create_fun);    // works
484 //	std::set<nfa_state_t> arg;
485 //	dfa_state_t res = create(arg);                       // works
486 //	create("bullshit");                                  // never works :-)
487 	dfa = nfa->constructDFA<dfa_state_t,CreateDFAState&>(create);   // finally
488 	delete nfa;
489 	valid = true;
490 }
491 
492 } // namespace
493