1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the utils of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:GPL-EXCEPT$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
21 ** included in the packaging of this file. Please review the following
22 ** information to ensure the GNU General Public License requirements will
23 ** be met: https://www.gnu.org/licenses/gpl-3.0.html.
24 **
25 ** $QT_END_LICENSE$
26 **
27 ****************************************************************************/
28 
29 #include "nfa.h"
30 #include "re2nfa.h"
31 #include "configfile.h"
32 #include "generator.h"
33 
34 #include <QFile>
35 #include <QCoreApplication>
36 #include <QFileInfo>
37 #include <QDateTime>
38 
39 struct Symbol
40 {
41     QString token;
42     QString lexem;
43 };
44 
tokenize(const DFA & dfa,const QString & input,Config * cfg,bool * ok=0)45 static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0)
46 {
47     QList<Symbol> symbols;
48     Symbol lastSymbol;
49     int state = 0;
50     int lastAcceptingState = -1;
51     QString lastAcceptingLexem;
52     int lastAcceptingPos = -1;
53     for (int i = 0; i < input.length(); ++i) {
54         QChar ch = input.at(i);
55         QChar chForInput = ch;
56         if (cfg->caseSensitivity == Qt::CaseInsensitive)
57             chForInput = chForInput.toLower();
58         int next = dfa.at(state).transitions.value(chForInput.unicode());
59         if (cfg->debug)
60             qDebug() << "input" << input.at(i) << "leads to state" << next;
61         if (next) {
62             lastSymbol.lexem.append(input.at(i));
63             lastSymbol.token = dfa.at(next).symbol;
64             if (!lastSymbol.token.isEmpty()) {
65                 lastAcceptingState = next;
66                 lastAcceptingLexem = lastSymbol.lexem;
67                 lastAcceptingPos = i;
68             }
69             state = next;
70         } else {
71             if (lastAcceptingState != -1) {
72                 if (cfg->debug)
73                     qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos;
74                 Symbol s;
75                 s.token = dfa.at(lastAcceptingState).symbol;
76                 s.lexem = lastAcceptingLexem;
77                 symbols << s;
78                 lastSymbol = Symbol();
79                 state = 0;
80                 i = lastAcceptingPos;
81                 lastAcceptingPos = -1;
82                 lastAcceptingState = -1;
83                 continue;
84             }
85             if (state == 0 || lastSymbol.token.isEmpty()) {
86                 if (cfg->debug)
87                     qDebug() << "invalid input";
88                 if (ok)
89                     *ok = false;
90                 return symbols;
91             }
92             if (cfg->debug)
93                 qDebug() << "appending symbol with token" << lastSymbol.token;
94             symbols << lastSymbol;
95             lastSymbol = Symbol();
96             state = 0;
97             lastAcceptingState = -1;
98             --i;
99         }
100     }
101     if (!lastSymbol.token.isEmpty()) {
102         if (cfg->debug)
103             qDebug() << "appending (last) symbol with token" << lastSymbol.token;
104         symbols << lastSymbol;
105     } else if (lastAcceptingState != -1) {
106         if (cfg->debug)
107             qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol;
108         Symbol s;
109         s.lexem = lastAcceptingLexem;
110         s.token = dfa.at(lastAcceptingState).symbol;
111         symbols << s;
112     }
113     if (ok)
114         *ok = true;
115     return symbols;
116 }
117 
determineMaxInputSet(const ConfigFile::Section & section)118 static QSet<InputType> determineMaxInputSet(const ConfigFile::Section &section)
119 {
120     QSet<InputType> set;
121 
122     QString inputTypeName;
123 
124     foreach (const ConfigFile::Entry &entry, section)
125         if (entry.key == QLatin1String("InputType")) {
126             if (!inputTypeName.isEmpty()) {
127                 qWarning("Error: InputType field specified multiple times in config file");
128                 return QSet<InputType>();
129             }
130             inputTypeName = entry.value;
131         }
132 
133     if (inputTypeName.isEmpty())
134         inputTypeName = "quint8";
135 
136     if (inputTypeName == "quint8") {
137         for (int i = 1; i < 256; ++i)
138             set.insert(i);
139     } /* else if ### */
140     else {
141         qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName));
142         return QSet<InputType>();
143     }
144 
145     return set;
146 }
147 
loadConfig(const QString & ruleFile,Config * cfg)148 static bool loadConfig(const QString &ruleFile, Config *cfg)
149 {
150     ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile);
151     if (sections.isEmpty()) {
152         qWarning("Error parsing %s", qPrintable(ruleFile));
153         return false;
154     }
155 
156     QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options"));
157     if (maxInputSet.isEmpty())
158         return false;
159 
160     Qt::CaseSensitivity cs = Qt::CaseInsensitive;
161     if (sections.value("Options").contains("case-sensitive"))
162         cs = Qt::CaseSensitive;
163 
164     cfg->configSections = sections;
165     cfg->caseSensitivity = cs;
166     cfg->className = sections.value("Options").value("classname", "Scanner");
167     cfg->maxInputSet = maxInputSet;
168     cfg->ruleFile = ruleFile;
169     return true;
170 }
171 
generateMachine(const Config & cfg)172 static DFA generateMachine(const Config &cfg)
173 {
174     if (cfg.cache) {
175         QFileInfo ruleInfo(cfg.ruleFile);
176         QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
177         if (cacheInfo.exists()
178             && cacheInfo.lastModified() > ruleInfo.lastModified()) {
179             QFile f(cacheInfo.absoluteFilePath());
180             f.open(QIODevice::ReadOnly);
181             QDataStream stream(&f);
182             DFA machine;
183             stream >> machine;
184             return machine;
185         }
186     }
187 
188     QMap<QString, NFA> macros;
189     foreach (ConfigFile::Entry e, cfg.configSections.value("Macros")) {
190         int errCol = 0;
191         if (cfg.debug)
192             qDebug() << "parsing" << e.value;
193         NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
194         if (nfa.isEmpty()) {
195             qWarning("Parse error in line %d column %d", e.lineNumber, errCol);
196             return DFA();
197         }
198         macros.insert(e.key, nfa);
199     }
200 
201     if (!cfg.configSections.contains("Tokens")) {
202         qWarning("Rule file does not contain a [Tokens] section!");
203         return DFA();
204     }
205 
206     QVector<NFA> tokens;
207 
208     foreach (ConfigFile::Entry e, cfg.configSections.value("Tokens")) {
209         int errCol = 0;
210         if (cfg.debug)
211             qDebug() << "parsing" << e.value;
212         NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
213         if (tok.isEmpty()) {
214             qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData());
215             return DFA();
216         }
217         tok.setTerminationSymbol(e.key);
218         tokens.append(tok);
219     }
220 
221     NFA giganticStateMachine;
222     foreach (NFA nfa, tokens)
223         if (giganticStateMachine.isEmpty())
224             giganticStateMachine = nfa;
225         else
226             giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa);
227 
228     DFA result = giganticStateMachine.toDFA().minimize();
229     if (cfg.cache) {
230         QFileInfo ruleInfo(cfg.ruleFile);
231         QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
232         QFile f(cacheInfo.absoluteFilePath());
233         f.open(QIODevice::WriteOnly | QIODevice::Truncate);
234         QDataStream stream(&f);
235         stream << result;
236     }
237     return result;
238 }
239 
240 #if !defined(AUTOTEST)
main(int argc,char ** argv)241 int main(int argc, char **argv)
242 {
243     QCoreApplication app(argc, argv);
244     QString ruleFile;
245     Config cfg;
246 
247     const QStringList arguments = app.arguments().mid(1);
248     cfg.debug = arguments.contains("-debug");
249     const bool testRules = arguments.contains("-test");
250     cfg.cache = arguments.contains("-cache");
251 
252     foreach (const QString &arg, arguments)
253         if (!arg.startsWith(QLatin1Char('-'))) {
254             ruleFile = arg;
255             break;
256         }
257 
258     if (ruleFile.isEmpty()) {
259         qWarning("usage: lexgen [-debug] [-cache] [-test] rulefile");
260         qWarning(" ");
261         qWarning("    the -test option will cause lexgen to interpret standard input");
262         qWarning("    according to the specified rules and print out pairs of token and");
263         qWarning("    lexical element");
264         return 1;
265     }
266 
267     if (!loadConfig(ruleFile, &cfg))
268         return 1;
269 
270     DFA machine = generateMachine(cfg);
271     if (machine.isEmpty())
272         return 1;
273 
274     if (testRules) {
275         qWarning("Testing:");
276         QString input = QTextStream(stdin).readAll();
277         /*
278         qDebug() << "NFA has" << machine.stateCount() << "states";
279         qDebug() << "Converting to DFA... (this may take a while)";
280         DFA dfa = machine.toDFA();
281         qDebug() << "DFA has" << dfa.count() << "states";
282         qDebug() << "Minimizing...";
283         dfa = dfa.minimize();
284         qDebug() << "Minimized DFA has" << dfa.count() << "states";
285         */
286         DFA dfa = machine;
287         if (cfg.debug)
288             qDebug() << "tokenizing" << input;
289         bool ok = false;
290         QList<Symbol> symbols = tokenize(dfa, input, &cfg, &ok);
291         if (symbols.isEmpty()) {
292             qWarning("No tokens produced!");
293         } else {
294             foreach (Symbol s, symbols)
295                     qDebug() << s.token << ":" << s.lexem;
296         }
297         if (ok)
298             qDebug() << symbols.count() << "tokens produced.";
299         else
300             qDebug() << "Error while tokenizing!";
301     } else {
302         Generator gen(machine, cfg);
303         QTextStream(stdout)
304             << gen.generate();
305     }
306 
307     return 0;
308 }
309 #endif
310 
311