1 /*
2 * digester.cpp
3 *
4 *
5 * Created by Andreas Vox on 02.06.06.
6 * Copyright 2006 under GPL2. All rights reserved.
7 *
8 */
9
10 #include <iostream>
11 #include <vector>
12 #include <map>
13 #include <utility>
14 #include <deque>
15 #include <functional>
16 #include <set>
17
18 #include "digester.h"
19 #include "automata.h"
20 #include "actions.h"
21
22 namespace desaxe {
23
24 typedef std::pair<Xml_string, Action> rule_t;
25
26 typedef unsigned short token_t;
27 typedef std::vector<token_t> path_t;
28 enum special_token { EMPTY = 0, START = 1, ANY = 2, REPEAT = 3 } ;
29
30 typedef unsigned short nfa_state_t;
31
32 struct DFA_State
33 {
34 unsigned int ID;
35 std::vector<rule_t> rules;
36
DFA_Statedesaxe::DFA_State37 DFA_State() : ID(0) {}
38 };
39
40 typedef DFA_State* dfa_state_t;
41
42
43 /**
44 * private implementation for Digester.
45 * uses a DFA to keep track of the currently recognized pattern(s).
46 * if you have no idea what a (non)deterministic finite automaton is you
47 * probably don't want to read the rest of this code.
48 */
49
50 class RuleState
51 {
52 public:
53 RuleState();
54 RuleState(const RuleState& other);
55 ~RuleState();
56 /// adds a rule to the list
57 void addRule(const Xml_string& pattern, Action action);
58 /// sets the pattern recognizing automaton to its start state
59 void reset();
60 /// returns the rules which apply for the currently recognized pattern
61 const std::vector<rule_t>& rulesForCurrentState();
62 /// enters a new state when \c tag is read
63 void open(const Xml_string& tag);
64 /// returns to the old state when the close tag is read
65 void close();
66 /// diagnostics
67 void dump();
68 private:
69 /// user sets these rules
70 std::vector<rule_t> rules;
71
72 /// rule patterns get broken into tokens:
73 std::map<Xml_string, token_t> tokens;
74 /// lists the accepting (NFA) states for each rule (same sequence as rules)
75 std::vector<nfa_state_t> accepting;
76 /// the dfa corresponding to the rule patterns
77 automata::DFA<dfa_state_t, token_t> *dfa;
78 /// stack of dfa states
79 std::vector<dfa_state_t> stateStack;
80
81 /// assigns a unique token to each string
82 token_t createToken(const Xml_string& tok);
83 void makeTokens();
84
85 /// creates a nondeterministic automaton as base for the DFA
86 automata::NFA<nfa_state_t, token_t>* createNFA();
87
88 /// uses rules and createNFA() to create the DFA and assigns rules to states
89 void compileDFA();
90 /// is true after compileDFA has run
91 bool valid;
92 };
93
94
Digester()95 Digester::Digester()
96 {
97 m_state = new RuleState();
98 m_result_.ptr = nullptr;
99 m_result_.type = "";
100 }
101
102
~Digester()103 Digester::~Digester() {
104 delete m_state;
105 deletePatches(m_patches);
106 }
107
108
operator =(const Digester & other)109 Digester& Digester::operator=(const Digester& other)
110 {
111 if (&other == this)
112 return *this;
113 delete m_state;
114 m_state = new RuleState(*other.m_state);
115 m_objects = other.m_objects;
116 m_storage = other.m_storage;
117 m_result_ = other.m_result_;
118 m_errors = other.m_errors;
119 return *this;
120 }
121
122
nrOfErrors() const123 int Digester::nrOfErrors() const
124 {
125 return m_errors.size();
126 }
127
getError(int i) const128 const Xml_string& Digester::getError(int i) const
129 {
130 return m_errors[i];
131 }
132
133
addRule(const Xml_string & pattern,Action action)134 void Digester::addRule(const Xml_string& pattern, Action action)
135 {
136 action.setDigester(this);
137 m_state->addRule(pattern, action);
138 }
139
reset()140 void Digester::reset()
141 {
142 m_objects.clear();
143 m_storage.clear();
144 m_result_.ptr = nullptr;
145 m_result_.type = "";
146 m_errors.clear();
147 }
148
beginDoc()149 void Digester::beginDoc()
150 {
151 m_state->reset();
152 #ifdef DESAXE_DEBUG
153 state->dump();
154 #endif
155 }
156
endDoc()157 void Digester::endDoc()
158 {
159 }
160
begin(const Xml_string & tag,Xml_attr attr)161 void Digester::begin(const Xml_string& tag, Xml_attr attr)
162 {
163 m_state->open(tag);
164 const std::vector<rule_t>& rules (m_state->rulesForCurrentState());
165 std::vector<rule_t>::const_iterator it;
166 for(it=rules.begin(); it!=rules.end(); ++it)
167 {
168 #ifdef DESAXE_DEBUG
169 std::cerr << "B " << it->first.toStdString() << " " << typeid(it->second).name() << "\n";
170 #endif
171 const_cast<Action&>(it->second).begin(tag, attr);
172 }
173 }
174
end(const Xml_string & tag)175 void Digester::end(const Xml_string& tag)
176 {
177 const std::vector<rule_t>& rules (m_state->rulesForCurrentState());
178 std::vector<rule_t>::const_reverse_iterator it;
179 for(it=rules.rbegin(); it!=rules.rend(); ++it)
180 {
181 #ifdef DESAXE_DEBUG
182 std::cerr << "E " << it->first.toStdString() << " " << typeid(it->second).name() << "\n";
183 #endif
184 const_cast<Action&>(it->second).end(tag);
185 }
186 m_state->close();
187 }
188
chars(const Xml_string & text)189 void Digester::chars(const Xml_string& text)
190 {
191 const std::vector<rule_t>& rules (m_state->rulesForCurrentState());
192 std::vector<rule_t>::const_iterator it;
193 for(it=rules.begin(); it!=rules.end(); ++it)
194 {
195 #ifdef DESAXE_DEBUG
196 std::cerr << "C " << it->first.toStdString() << " " << typeid(it->second).name() << "\n";
197 #endif
198 const_cast<Action&>(it->second).chars(text);
199 }
200 }
201
202
concat(const Xml_string & pattern1,const Xml_string & pattern2)203 Xml_string Digester::concat(const Xml_string& pattern1, const Xml_string& pattern2)
204 {
205 if (pattern1.isEmpty())
206 return pattern2;
207 if (pattern2.isEmpty())
208 return pattern1;
209 if ( (pattern1[pattern1.length()-1] != '/') && (pattern2[0] != '/') )
210 // insert "/" as separator
211 return pattern1 + "/" + pattern2;
212 if ( (pattern1[pattern1.length()-1]=='/') || (pattern2[0]=='/') )
213 return pattern1 + pattern2;
214 // cut off one "/"
215 return pattern1 + QString::fromStdString(std::string(static_cast<const std::string&>(pattern2.toStdString()), 1, std::string::npos));
216 }
217
218
RuleState()219 RuleState::RuleState() : dfa(nullptr), valid(false)
220 {}
221
RuleState(const RuleState & other)222 RuleState::RuleState(const RuleState& other) : rules(other.rules), dfa(nullptr), valid(false)
223 {}
224
~RuleState()225 RuleState::~RuleState()
226 {
227 if (dfa)
228 {
229 std::set<dfa_state_t> morituri(dfa->states());
230 for (auto i = morituri.begin(); i != morituri.end(); ++i)
231 delete *i;
232 delete dfa;
233 }
234 }
235
addRule(const Xml_string & pattern,Action action)236 void RuleState::addRule(const Xml_string& pattern, Action action)
237 {
238 rules.push_back(std::pair<Xml_string, Action>(pattern,action));
239 valid = false;
240 }
241
242
243 inline
reset()244 void RuleState::reset()
245 {
246 std::vector<rule_t>::iterator it;
247 for (it = rules.begin(); it != rules.end(); ++it)
248 it->second.reset();
249 stateStack.clear();
250 if (!valid)
251 compileDFA();
252 stateStack.push_back(dfa->start());
253 }
254
dump()255 void RuleState::dump()
256 {
257 std::cout << "Rules:\n";
258 for (unsigned int r=0; r<rules.size(); ++r) {
259 std::cout << r << ":\t\"" << rules[r].first.toStdString() << "\" accepted in (" << accepting[r] << ")\n";
260 }
261 std::cout << "\nTokens:\n";
262 for (std::map<Xml_string, token_t>::iterator it=tokens.begin(); it!=tokens.end(); ++it) {
263 std::cout << it->first.toStdString() << ":\t--> " << it->second << "\n";
264 }
265 std::cout << "\nAutomaton:\n";
266 const std::set<dfa_state_t>& states(dfa->states());
267 const std::set<token_t>& inputs(dfa->inputs());
268 std::cout << "STATE";
269 for (std::set<token_t>::const_iterator i=inputs.begin(); i != inputs.end(); ++i) {
270 std::cout << "\t" << *i;
271 }
272 std::cout << "\tRULES\n\n";
273 for (std::set<dfa_state_t>::const_iterator s=states.begin(); s!=states.end(); ++s) {
274 std::cout << (*s)->ID;
275 for (std::set<token_t>::const_iterator i=inputs.begin(); i!=inputs.end(); ++i) {
276 dfa_state_t nstate = dfa->next(*s,*i);
277 std::cout << "\t";
278 if (nstate)
279 std::cout << nstate->ID;
280 else
281 std::cout << "--";
282 }
283 for (std::vector<rule_t>::iterator rs=(*s)->rules.begin(); rs!=(*s)->rules.end(); ++rs)
284 std::cout << "\t\"" << rs->first.toStdString() << "\"";
285 std::cout << "\n";
286 }
287 }
288
289 inline
open(const Xml_string & tag)290 void RuleState::open(const Xml_string& tag)
291 {
292 dfa_state_t nstate = dfa->next(stateStack.back(), tokens[tag]);
293 assert(nstate != nullptr);
294 #ifdef DESAXE_DEBUG
295 std::cerr << "to state " << nstate->ID << "\n";
296 #endif
297 if (nstate->ID == dfa->deflt()->ID) {
298 nstate = dfa->next(stateStack.back(), ANY);
299 #ifdef DESAXE_DEBUG
300 std::cerr << "empty, trying any:" << nstate->ID << "\n";
301 #endif
302 }
303 stateStack.push_back(nstate);
304 }
305
306 inline
close()307 void RuleState::close()
308 {
309 stateStack.pop_back();
310 }
311
312
313 inline
rulesForCurrentState()314 const std::vector<rule_t>& RuleState::rulesForCurrentState()
315 {
316 return stateStack.back()->rules;
317 }
318
makeTokens()319 void RuleState::makeTokens()
320 {
321 tokens.clear();
322 tokens[""] = EMPTY;
323 tokens["/"] = START;
324 tokens["*"] = ANY;
325 tokens["**"] = REPEAT;
326 }
327
328
329
createToken(const Xml_string & tok)330 token_t RuleState::createToken(const Xml_string& tok)
331 {
332 if (tokens.find(tok) == tokens.end())
333 tokens[tok] = tokens.size() + 1;
334 return tokens[tok];
335 }
336
337
338
createNFA()339 automata::NFA<nfa_state_t, token_t>* RuleState::createNFA()
340 {
341 using automata::NFA;
342
343 /// maps paths uniquely to an nfa_state
344 std::map<path_t, nfa_state_t> nfa_states;
345 nfa_states.clear();
346
347 path_t prefix;
348
349 // START and EMPTY are both: tokens and nfa_states
350 prefix.push_back(START);
351 nfa_states[prefix] = START;
352
353 prefix[0] = EMPTY;
354 nfa_states[prefix] = EMPTY;
355
356 std::set<nfa_state_t> deflt;
357 deflt.insert(EMPTY);
358
359 NFA<nfa_state_t, token_t> *nfa = new NFA<nfa_state_t, token_t>(START, deflt);
360
361 nfa->addState(EMPTY);
362 nfa->addInput(ANY);
363 nfa->addTransition(EMPTY, ANY, EMPTY);
364
365 for (unsigned int i = 0; i < rules.size(); ++i)
366 {
367 const std::string currPattern(fromXMLString(rules[i].first));
368 const unsigned int len = currPattern.length();
369 int pos;
370 nfa_state_t lastState;
371
372 // determine if this is a start pattern
373 prefix.resize(1);
374 if (currPattern[0] == '/') {
375 prefix[0] = START;
376 pos = 1;
377 lastState = START;
378 }
379 else {
380 prefix[0] = EMPTY;
381 pos = 0;
382 lastState = EMPTY;
383 }
384
385 // std::cerr << "looking at pattern: " << currPattern << "\n";
386 // for all prefixes
387 do {
388 std::string::size_type pos2 = currPattern.find('/', pos);
389 if (pos2 == std::string::npos)
390 pos2 = len;
391
392 std::string diff(currPattern.substr(pos, pos2-pos));
393 token_t tok = createToken(fromSTLString(diff));
394 // std::cerr << pos << "-" << pos2 << "\t: " << diff << " = " << tok << "\n";
395
396 // create loop if REPEAT token
397 if (tok == REPEAT) {
398 nfa->addTransition(lastState, ANY, lastState); //FIXME: that's wrong, need to create repeating state
399 // std::cerr << "T " << lastState << "--*-->" << lastState << "\n";
400 pos = pos2 + 1;
401 continue;
402 }
403
404 prefix.push_back(tok);
405 // create new state if necessary
406 nfa_state_t nstate;
407 if (nfa_states.find(prefix) != nfa_states.end()) {
408 nstate = nfa_states[prefix];
409 }
410 else {
411 nstate = nfa_states.size();
412 nfa->addState(nstate);
413 nfa_states[prefix] = nstate;
414 }
415 // add transition
416 nfa->addInput(tok);
417 nfa->addTransition(lastState, tok, nstate);
418 // std::cerr << "T " << lastState << "--(" << tok << ")-->" << nstate << "\n";
419 lastState = nstate;
420 pos = pos2 + 1;
421 } while(pos < signed(len));
422 accepting.push_back(lastState);
423 // std::cerr << "accepted in " << lastState << "\n";
424
425 // copy all transition from EMPTY to all other states
426 const NFA<nfa_state_t, token_t>::Transitions& transFromEmpty(nfa->transitions(EMPTY));
427 std::set<nfa_state_t>::const_iterator it, st;
428 NFA<nfa_state_t, token_t>::Transitions::const_iterator tr;
429 for (it = nfa->states().begin(); it != nfa->states().end(); ++it) {
430 if (*it == EMPTY)
431 continue;
432 for (tr = transFromEmpty.begin(); tr != transFromEmpty.end(); ++tr)
433 for (st = tr->second.begin(); st != tr->second.end(); ++st)
434 nfa->addTransition(*it, tr->first, *st);
435 }
436
437 // ANY transitions
438 const std::set<token_t>& inputs(nfa->inputs());
439 std::set<token_t>::const_iterator tok;
440 for (it = nfa->states().begin(); it != nfa->states().end(); ++it) {
441 const std::set<nfa_state_t>& anyStates(nfa->next(*it, ANY));
442 for (st = anyStates.begin(); st != anyStates.end(); ++st)
443 for (tok=inputs.begin(); tok != inputs.end(); ++tok)
444 if (*tok != ANY)
445 nfa->addTransition(*it, *tok, *st);
446 }
447 }
448 return nfa;
449 }
450
451 struct CreateDFAState : public std::unary_function <std::set<nfa_state_t>, dfa_state_t> {
452
CreateDFAStatedesaxe::CreateDFAState453 CreateDFAState(const std::vector<rule_t>& rules, const std::vector<nfa_state_t>& accepting)
454 : n(0), rules_(rules), accepting_(accepting)
455 {}
456
operator ()desaxe::CreateDFAState457 dfa_state_t operator()(const std::set<nfa_state_t>& combination)
458 {
459 dfa_state_t result = new DFA_State();
460 result->ID = n++;
461 result->rules.clear();
462 for (unsigned int i=0; i < rules_.size(); ++i)
463 if (combination.find(accepting_[i]) != combination.end())
464 result->rules.push_back(rules_[i]);
465 return result;
466 }
467
468 unsigned int n;
469 const std::vector<rule_t>& rules_;
470 const std::vector<nfa_state_t>& accepting_;
471 };
472
473 //dfa_state_t create_fun(const std::set<nfa_state_t>& combination);
474
compileDFA()475 void RuleState::compileDFA()
476 {
477 // build NFA:
478 makeTokens();
479 automata::NFA<nfa_state_t, token_t> *nfa = createNFA();
480 // make deterministic:
481 CreateDFAState create(rules, accepting);
482 // dfa = nfa->constructDFA<dfa_state_t>(create); // wont work
483 // dfa = nfa->constructDFA<dfa_state_t>(create_fun); // works
484 // std::set<nfa_state_t> arg;
485 // dfa_state_t res = create(arg); // works
486 // create("bullshit"); // never works :-)
487 dfa = nfa->constructDFA<dfa_state_t,CreateDFAState&>(create); // finally
488 delete nfa;
489 valid = true;
490 }
491
492 } // namespace
493