1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the utils of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:GPL-EXCEPT$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
21 ** included in the packaging of this file. Please review the following
22 ** information to ensure the GNU General Public License requirements will
23 ** be met: https://www.gnu.org/licenses/gpl-3.0.html.
24 **
25 ** $QT_END_LICENSE$
26 **
27 ****************************************************************************/
28
29 #include "nfa.h"
30 #include "re2nfa.h"
31 #include "configfile.h"
32 #include "generator.h"
33
34 #include <QFile>
35 #include <QCoreApplication>
36 #include <QFileInfo>
37 #include <QDateTime>
38
39 struct Symbol
40 {
41 QString token;
42 QString lexem;
43 };
44
tokenize(const DFA & dfa,const QString & input,Config * cfg,bool * ok=0)45 static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0)
46 {
47 QList<Symbol> symbols;
48 Symbol lastSymbol;
49 int state = 0;
50 int lastAcceptingState = -1;
51 QString lastAcceptingLexem;
52 int lastAcceptingPos = -1;
53 for (int i = 0; i < input.length(); ++i) {
54 QChar ch = input.at(i);
55 QChar chForInput = ch;
56 if (cfg->caseSensitivity == Qt::CaseInsensitive)
57 chForInput = chForInput.toLower();
58 int next = dfa.at(state).transitions.value(chForInput.unicode());
59 if (cfg->debug)
60 qDebug() << "input" << input.at(i) << "leads to state" << next;
61 if (next) {
62 lastSymbol.lexem.append(input.at(i));
63 lastSymbol.token = dfa.at(next).symbol;
64 if (!lastSymbol.token.isEmpty()) {
65 lastAcceptingState = next;
66 lastAcceptingLexem = lastSymbol.lexem;
67 lastAcceptingPos = i;
68 }
69 state = next;
70 } else {
71 if (lastAcceptingState != -1) {
72 if (cfg->debug)
73 qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos;
74 Symbol s;
75 s.token = dfa.at(lastAcceptingState).symbol;
76 s.lexem = lastAcceptingLexem;
77 symbols << s;
78 lastSymbol = Symbol();
79 state = 0;
80 i = lastAcceptingPos;
81 lastAcceptingPos = -1;
82 lastAcceptingState = -1;
83 continue;
84 }
85 if (state == 0 || lastSymbol.token.isEmpty()) {
86 if (cfg->debug)
87 qDebug() << "invalid input";
88 if (ok)
89 *ok = false;
90 return symbols;
91 }
92 if (cfg->debug)
93 qDebug() << "appending symbol with token" << lastSymbol.token;
94 symbols << lastSymbol;
95 lastSymbol = Symbol();
96 state = 0;
97 lastAcceptingState = -1;
98 --i;
99 }
100 }
101 if (!lastSymbol.token.isEmpty()) {
102 if (cfg->debug)
103 qDebug() << "appending (last) symbol with token" << lastSymbol.token;
104 symbols << lastSymbol;
105 } else if (lastAcceptingState != -1) {
106 if (cfg->debug)
107 qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol;
108 Symbol s;
109 s.lexem = lastAcceptingLexem;
110 s.token = dfa.at(lastAcceptingState).symbol;
111 symbols << s;
112 }
113 if (ok)
114 *ok = true;
115 return symbols;
116 }
117
determineMaxInputSet(const ConfigFile::Section & section)118 static QSet<InputType> determineMaxInputSet(const ConfigFile::Section §ion)
119 {
120 QSet<InputType> set;
121
122 QString inputTypeName;
123
124 foreach (const ConfigFile::Entry &entry, section)
125 if (entry.key == QLatin1String("InputType")) {
126 if (!inputTypeName.isEmpty()) {
127 qWarning("Error: InputType field specified multiple times in config file");
128 return QSet<InputType>();
129 }
130 inputTypeName = entry.value;
131 }
132
133 if (inputTypeName.isEmpty())
134 inputTypeName = "quint8";
135
136 if (inputTypeName == "quint8") {
137 for (int i = 1; i < 256; ++i)
138 set.insert(i);
139 } /* else if ### */
140 else {
141 qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName));
142 return QSet<InputType>();
143 }
144
145 return set;
146 }
147
loadConfig(const QString & ruleFile,Config * cfg)148 static bool loadConfig(const QString &ruleFile, Config *cfg)
149 {
150 ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile);
151 if (sections.isEmpty()) {
152 qWarning("Error parsing %s", qPrintable(ruleFile));
153 return false;
154 }
155
156 QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options"));
157 if (maxInputSet.isEmpty())
158 return false;
159
160 Qt::CaseSensitivity cs = Qt::CaseInsensitive;
161 if (sections.value("Options").contains("case-sensitive"))
162 cs = Qt::CaseSensitive;
163
164 cfg->configSections = sections;
165 cfg->caseSensitivity = cs;
166 cfg->className = sections.value("Options").value("classname", "Scanner");
167 cfg->maxInputSet = maxInputSet;
168 cfg->ruleFile = ruleFile;
169 return true;
170 }
171
generateMachine(const Config & cfg)172 static DFA generateMachine(const Config &cfg)
173 {
174 if (cfg.cache) {
175 QFileInfo ruleInfo(cfg.ruleFile);
176 QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
177 if (cacheInfo.exists()
178 && cacheInfo.lastModified() > ruleInfo.lastModified()) {
179 QFile f(cacheInfo.absoluteFilePath());
180 f.open(QIODevice::ReadOnly);
181 QDataStream stream(&f);
182 DFA machine;
183 stream >> machine;
184 return machine;
185 }
186 }
187
188 QMap<QString, NFA> macros;
189 foreach (ConfigFile::Entry e, cfg.configSections.value("Macros")) {
190 int errCol = 0;
191 if (cfg.debug)
192 qDebug() << "parsing" << e.value;
193 NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
194 if (nfa.isEmpty()) {
195 qWarning("Parse error in line %d column %d", e.lineNumber, errCol);
196 return DFA();
197 }
198 macros.insert(e.key, nfa);
199 }
200
201 if (!cfg.configSections.contains("Tokens")) {
202 qWarning("Rule file does not contain a [Tokens] section!");
203 return DFA();
204 }
205
206 QVector<NFA> tokens;
207
208 foreach (ConfigFile::Entry e, cfg.configSections.value("Tokens")) {
209 int errCol = 0;
210 if (cfg.debug)
211 qDebug() << "parsing" << e.value;
212 NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
213 if (tok.isEmpty()) {
214 qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData());
215 return DFA();
216 }
217 tok.setTerminationSymbol(e.key);
218 tokens.append(tok);
219 }
220
221 NFA giganticStateMachine;
222 foreach (NFA nfa, tokens)
223 if (giganticStateMachine.isEmpty())
224 giganticStateMachine = nfa;
225 else
226 giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa);
227
228 DFA result = giganticStateMachine.toDFA().minimize();
229 if (cfg.cache) {
230 QFileInfo ruleInfo(cfg.ruleFile);
231 QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
232 QFile f(cacheInfo.absoluteFilePath());
233 f.open(QIODevice::WriteOnly | QIODevice::Truncate);
234 QDataStream stream(&f);
235 stream << result;
236 }
237 return result;
238 }
239
240 #if !defined(AUTOTEST)
main(int argc,char ** argv)241 int main(int argc, char **argv)
242 {
243 QCoreApplication app(argc, argv);
244 QString ruleFile;
245 Config cfg;
246
247 const QStringList arguments = app.arguments().mid(1);
248 cfg.debug = arguments.contains("-debug");
249 const bool testRules = arguments.contains("-test");
250 cfg.cache = arguments.contains("-cache");
251
252 foreach (const QString &arg, arguments)
253 if (!arg.startsWith(QLatin1Char('-'))) {
254 ruleFile = arg;
255 break;
256 }
257
258 if (ruleFile.isEmpty()) {
259 qWarning("usage: lexgen [-debug] [-cache] [-test] rulefile");
260 qWarning(" ");
261 qWarning(" the -test option will cause lexgen to interpret standard input");
262 qWarning(" according to the specified rules and print out pairs of token and");
263 qWarning(" lexical element");
264 return 1;
265 }
266
267 if (!loadConfig(ruleFile, &cfg))
268 return 1;
269
270 DFA machine = generateMachine(cfg);
271 if (machine.isEmpty())
272 return 1;
273
274 if (testRules) {
275 qWarning("Testing:");
276 QString input = QTextStream(stdin).readAll();
277 /*
278 qDebug() << "NFA has" << machine.stateCount() << "states";
279 qDebug() << "Converting to DFA... (this may take a while)";
280 DFA dfa = machine.toDFA();
281 qDebug() << "DFA has" << dfa.count() << "states";
282 qDebug() << "Minimizing...";
283 dfa = dfa.minimize();
284 qDebug() << "Minimized DFA has" << dfa.count() << "states";
285 */
286 DFA dfa = machine;
287 if (cfg.debug)
288 qDebug() << "tokenizing" << input;
289 bool ok = false;
290 QList<Symbol> symbols = tokenize(dfa, input, &cfg, &ok);
291 if (symbols.isEmpty()) {
292 qWarning("No tokens produced!");
293 } else {
294 foreach (Symbol s, symbols)
295 qDebug() << s.token << ":" << s.lexem;
296 }
297 if (ok)
298 qDebug() << symbols.count() << "tokens produced.";
299 else
300 qDebug() << "Error while tokenizing!";
301 } else {
302 Generator gen(machine, cfg);
303 QTextStream(stdout)
304 << gen.generate();
305 }
306
307 return 0;
308 }
309 #endif
310
311