1 //===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class can produce a generic deterministic finite state automaton (DFA),
10 // given a set of possible states and transitions.
11 //
12 // The input transitions can be nondeterministic - this class will produce the
13 // deterministic equivalent state machine.
14 //
15 // The generated code can run the DFA and produce an accepted / not accepted
16 // state and also produce, given a sequence of transitions that results in an
17 // accepted state, the sequence of intermediate states. This is useful if the
18 // initial automaton was nondeterministic - it allows mapping back from the DFA
19 // to the NFA.
20 //
21 //===----------------------------------------------------------------------===//
22 
#include "DFAEmitter.h"
#include "CodeGenTarget.h"
#include "SequenceToOffsetTable.h"
#include "TableGenBackends.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <cassert>
#include <cstdint>
#include <deque>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
40 
41 #define DEBUG_TYPE "dfa-emitter"
42 
43 using namespace llvm;
44 
45 //===----------------------------------------------------------------------===//
46 // DfaEmitter implementation. This is independent of the GenAutomaton backend.
47 //===----------------------------------------------------------------------===//
48 
49 void DfaEmitter::addTransition(state_type From, state_type To, action_type A) {
50   Actions.insert(A);
51   NfaStates.insert(From);
52   NfaStates.insert(To);
53   NfaTransitions[{From, A}].push_back(To);
54   ++NumNfaTransitions;
55 }
56 
57 void DfaEmitter::visitDfaState(const DfaState &DS) {
58   // For every possible action...
59   auto FromId = DfaStates.idFor(DS);
60   for (action_type A : Actions) {
61     DfaState NewStates;
62     DfaTransitionInfo TI;
63     // For every represented state, word pair in the original NFA...
64     for (state_type FromState : DS) {
65       // If this action is possible from this state add the transitioned-to
66       // states to NewStates.
67       auto I = NfaTransitions.find({FromState, A});
68       if (I == NfaTransitions.end())
69         continue;
70       for (state_type &ToState : I->second) {
71         NewStates.push_back(ToState);
72         TI.emplace_back(FromState, ToState);
73       }
74     }
75     if (NewStates.empty())
76       continue;
77     // Sort and unique.
78     sort(NewStates);
79     NewStates.erase(std::unique(NewStates.begin(), NewStates.end()),
80                     NewStates.end());
81     sort(TI);
82     TI.erase(std::unique(TI.begin(), TI.end()), TI.end());
83     unsigned ToId = DfaStates.insert(NewStates);
84     DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI));
85   }
86 }
87 
88 void DfaEmitter::constructDfa() {
89   DfaState Initial(1, /*NFA initial state=*/0);
90   DfaStates.insert(Initial);
91 
92   // Note that UniqueVector starts indices at 1, not zero.
93   unsigned DfaStateId = 1;
94   while (DfaStateId <= DfaStates.size()) {
95     DfaState S = DfaStates[DfaStateId];
96     visitDfaState(S);
97     DfaStateId++;
98   }
99 }
100 
101 void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
102   constructDfa();
103 
104   OS << "// Input NFA has " << NfaStates.size() << " states with "
105      << NumNfaTransitions << " transitions.\n";
106   OS << "// Generated DFA has " << DfaStates.size() << " states with "
107      << DfaTransitions.size() << " transitions.\n\n";
108 
109   // Implementation note: We don't bake a simple std::pair<> here as it requires
110   // significantly more effort to parse. A simple test with a large array of
111   // struct-pairs (N=100000) took clang-10 6s to parse. The same array of
112   // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to
113   // define the pair type.
114   //
115   // FIXME: It may make sense to emit these as ULEB sequences instead of
116   // pairs of uint64_t.
117   OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n";
118   OS << "// transition implies a set of NFA transitions. These are referred\n";
119   OS << "// to by index in " << Name << "Transitions[].\n";
120 
121   SequenceToOffsetTable<DfaTransitionInfo> Table;
122   std::map<DfaTransitionInfo, unsigned> EmittedIndices;
123   for (auto &T : DfaTransitions)
124     Table.add(T.second.second);
125   Table.layout();
126   OS << "const std::array<NfaStatePair, " << Table.size() << "> " << Name
127      << "TransitionInfo = {{\n";
128   Table.emit(
129       OS,
130       [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) {
131         OS << "{" << P.first << ", " << P.second << "}";
132       },
133       "{0ULL, 0ULL}");
134 
135   OS << "}};\n\n";
136 
137   OS << "// A transition in the generated " << Name << " DFA.\n";
138   OS << "struct " << Name << "Transition {\n";
139   OS << "  unsigned FromDfaState; // The transitioned-from DFA state.\n";
140   OS << "  ";
141   printActionType(OS);
142   OS << " Action;       // The input symbol that causes this transition.\n";
143   OS << "  unsigned ToDfaState;   // The transitioned-to DFA state.\n";
144   OS << "  unsigned InfoIdx;      // Start index into " << Name
145      << "TransitionInfo.\n";
146   OS << "};\n\n";
147 
148   OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
149   OS << "// The initial state is 1, not zero.\n";
150   OS << "const std::array<" << Name << "Transition, "
151      << DfaTransitions.size() << "> " << Name << "Transitions = {{\n";
152   for (auto &KV : DfaTransitions) {
153     dfa_state_type From = KV.first.first;
154     dfa_state_type To = KV.second.first;
155     action_type A = KV.first.second;
156     unsigned InfoIdx = Table.get(KV.second.second);
157     OS << "  {" << From << ", ";
158     printActionValue(A, OS);
159     OS << ", " << To << ", " << InfoIdx << "},\n";
160   }
161   OS << "\n}};\n\n";
162 }
163 
164 void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; }
165 
166 void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; }
167 
168 //===----------------------------------------------------------------------===//
169 // AutomatonEmitter implementation
170 //===----------------------------------------------------------------------===//
171 
172 namespace {
173 // FIXME: This entire discriminated union could be removed with c++17:
174 //   using Action = std::variant<Record *, unsigned, std::string>;
175 struct Action {
176   Record *R = nullptr;
177   unsigned I = 0;
178   std::string S;
179 
180   Action() = default;
181   Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {}
182 
183   void print(raw_ostream &OS) const {
184     if (R)
185       OS << R->getName();
186     else if (!S.empty())
187       OS << '"' << S << '"';
188     else
189       OS << I;
190   }
191   bool operator<(const Action &Other) const {
192     return std::make_tuple(R, I, S) <
193            std::make_tuple(Other.R, Other.I, Other.S);
194   }
195 };
196 
197 using ActionTuple = std::vector<Action>;
198 class Automaton;
199 
200 class Transition {
201   uint64_t NewState;
202   // The tuple of actions that causes this transition.
203   ActionTuple Actions;
204   // The types of the actions; this is the same across all transitions.
205   SmallVector<std::string, 4> Types;
206 
207 public:
208   Transition(Record *R, Automaton *Parent);
209   const ActionTuple &getActions() { return Actions; }
210   SmallVector<std::string, 4> getTypes() { return Types; }
211 
212   bool canTransitionFrom(uint64_t State);
213   uint64_t transitionFrom(uint64_t State);
214 };
215 
216 class Automaton {
217   RecordKeeper &Records;
218   Record *R;
219   std::vector<Transition> Transitions;
220   /// All possible action tuples, uniqued.
221   UniqueVector<ActionTuple> Actions;
222   /// The fields within each Transition object to find the action symbols.
223   std::vector<StringRef> ActionSymbolFields;
224 
225 public:
226   Automaton(RecordKeeper &Records, Record *R);
227   void emit(raw_ostream &OS);
228 
229   ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; }
230   /// If the type of action A has been overridden (there exists a field
231   /// "TypeOf_A") return that, otherwise return the empty string.
232   StringRef getActionSymbolType(StringRef A);
233 };
234 
235 class AutomatonEmitter {
236   RecordKeeper &Records;
237 
238 public:
239   AutomatonEmitter(RecordKeeper &R) : Records(R) {}
240   void run(raw_ostream &OS);
241 };
242 
243 /// A DfaEmitter implementation that can print our variant action type.
244 class CustomDfaEmitter : public DfaEmitter {
245   const UniqueVector<ActionTuple> &Actions;
246   std::string TypeName;
247 
248 public:
249   CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName)
250       : Actions(Actions), TypeName(TypeName) {}
251 
252   void printActionType(raw_ostream &OS) override;
253   void printActionValue(action_type A, raw_ostream &OS) override;
254 };
255 } // namespace
256 
257 void AutomatonEmitter::run(raw_ostream &OS) {
258   for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) {
259     Automaton A(Records, R);
260     OS << "#ifdef GET_" << R->getName() << "_DECL\n";
261     A.emit(OS);
262     OS << "#endif  // GET_" << R->getName() << "_DECL\n";
263   }
264 }
265 
266 Automaton::Automaton(RecordKeeper &Records, Record *R)
267     : Records(Records), R(R) {
268   LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n");
269   ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields");
270 }
271 
272 void Automaton::emit(raw_ostream &OS) {
273   StringRef TransitionClass = R->getValueAsString("TransitionClass");
274   for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) {
275     assert(T->isSubClassOf("Transition"));
276     Transitions.emplace_back(T, this);
277     Actions.insert(Transitions.back().getActions());
278   }
279 
280   LLVM_DEBUG(dbgs() << "  Action alphabet cardinality: " << Actions.size()
281                     << "\n");
282   LLVM_DEBUG(dbgs() << "  Each state has " << Transitions.size()
283                     << " potential transitions.\n");
284 
285   StringRef Name = R->getName();
286 
287   CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action");
288   // Starting from the initial state, build up a list of possible states and
289   // transitions.
290   std::deque<uint64_t> Worklist(1, 0);
291   std::set<uint64_t> SeenStates;
292   unsigned NumTransitions = 0;
293   SeenStates.insert(Worklist.front());
294   while (!Worklist.empty()) {
295     uint64_t State = Worklist.front();
296     Worklist.pop_front();
297     for (Transition &T : Transitions) {
298       if (!T.canTransitionFrom(State))
299         continue;
300       uint64_t NewState = T.transitionFrom(State);
301       if (SeenStates.emplace(NewState).second)
302         Worklist.emplace_back(NewState);
303       ++NumTransitions;
304       Emitter.addTransition(State, NewState, Actions.idFor(T.getActions()));
305     }
306   }
307   LLVM_DEBUG(dbgs() << "  NFA automaton has " << SeenStates.size()
308                     << " states with " << NumTransitions << " transitions.\n");
309 
310   const auto &ActionTypes = Transitions.back().getTypes();
311   OS << "// The type of an action in the " << Name << " automaton.\n";
312   if (ActionTypes.size() == 1) {
313     OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n";
314   } else {
315     OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ")
316        << ">;\n";
317   }
318   OS << "\n";
319 
320   Emitter.emit(Name, OS);
321 }
322 
323 StringRef Automaton::getActionSymbolType(StringRef A) {
324   Twine Ty = "TypeOf_" + A;
325   if (!R->getValue(Ty.str()))
326     return "";
327   return R->getValueAsString(Ty.str());
328 }
329 
330 Transition::Transition(Record *R, Automaton *Parent) {
331   BitsInit *NewStateInit = R->getValueAsBitsInit("NewState");
332   NewState = 0;
333   assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 &&
334          "State cannot be represented in 64 bits!");
335   for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) {
336     if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) {
337       if (Bit->getValue())
338         NewState |= 1ULL << I;
339     }
340   }
341 
342   for (StringRef A : Parent->getActionSymbolFields()) {
343     RecordVal *SymbolV = R->getValue(A);
344     if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) {
345       Actions.emplace_back(R->getValueAsDef(A), 0, "");
346       Types.emplace_back(Ty->getAsString());
347     } else if (isa<IntRecTy>(SymbolV->getType())) {
348       Actions.emplace_back(nullptr, R->getValueAsInt(A), "");
349       Types.emplace_back("unsigned");
350     } else if (isa<StringRecTy>(SymbolV->getType())) {
351       Actions.emplace_back(nullptr, 0, std::string(R->getValueAsString(A)));
352       Types.emplace_back("std::string");
353     } else {
354       report_fatal_error("Unhandled symbol type!");
355     }
356 
357     StringRef TypeOverride = Parent->getActionSymbolType(A);
358     if (!TypeOverride.empty())
359       Types.back() = std::string(TypeOverride);
360   }
361 }
362 
363 bool Transition::canTransitionFrom(uint64_t State) {
364   if ((State & NewState) == 0)
365     // The bits we want to set are not set;
366     return true;
367   return false;
368 }
369 
370 uint64_t Transition::transitionFrom(uint64_t State) {
371   return State | NewState;
372 }
373 
374 void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
375 
376 void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) {
377   const ActionTuple &AT = Actions[A];
378   if (AT.size() > 1)
379     OS << "std::make_tuple(";
380   ListSeparator LS;
381   for (const auto &SingleAction : AT) {
382     OS << LS;
383     SingleAction.print(OS);
384   }
385   if (AT.size() > 1)
386     OS << ")";
387 }
388 
389 namespace llvm {
390 
391 void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) {
392   AutomatonEmitter(RK).run(OS);
393 }
394 
395 } // namespace llvm
396