/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "src/sksl/lex/NFAtoDFA.h"
#include "src/sksl/lex/RegexParser.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

/**
 * Processes a .lex file and produces .h and .cpp files which implement a lexical analyzer. The .lex
 * file is a text file with one token definition per line. Each line is of the form:
 * <TOKEN_NAME> = <pattern>
 * where <pattern> is either a regular expression (e.g. [0-9]) or a double-quoted literal string.
 */
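// For illustration only, a .lex input might contain definitions such as the following (the token
// names and patterns here are hypothetical examples rather than the actual SkSL grammar):
//
//     INT_LITERAL = [0-9]+
//     IF          = "if"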

static constexpr const char* HEADER =
    "/*\n"
    " * Copyright 2017 Google Inc.\n"
    " *\n"
    " * Use of this source code is governed by a BSD-style license that can be\n"
    " * found in the LICENSE file.\n"
    " */\n"
    "/*****************************************************************************************\n"
    " ******************** This file was generated by sksllex. Do not edit. *******************\n"
    " *****************************************************************************************/\n";

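/**
 * Generates the lexer's header file, which declares a Token struct (kind, offset, length) and a
 * lexer class exposing start() and next(). A minimal sketch of how the generated interface is
 * typically consumed, assuming the lexer and token class names passed on the command line are
 * "Lexer" and "Token":
 *
 *     SkSL::Lexer lexer;
 *     lexer.start(text, length);
 *     for (SkSL::Token t = lexer.next(); t.fKind != SkSL::Token::Kind::TK_END_OF_FILE;
 *          t = lexer.next()) {
 *         // t.fOffset and t.fLength locate the token's text within the input
 *     }
 */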
void writeH(const DFA& dfa, const char* lexer, const char* token,
            const std::vector<std::string>& tokens, const char* hPath) {
    std::ofstream out(hPath);
    SkASSERT(out.good());
    out << HEADER;
    out << "#ifndef SKSL_" << lexer << "\n";
    out << "#define SKSL_" << lexer << "\n";
    out << "#include <cstddef>\n";
    out << "#include <cstdint>\n";
    out << "namespace SkSL {\n";
    out << "\n";
    out << "struct " << token << " {\n";
    out << "    enum class Kind {\n";
    for (const std::string& t : tokens) {
        out << "        TK_" << t << ",\n";
    }
    out << "    };\n";
    out << "\n";
    out << "    " << token << "()\n";
    out << "    : fKind(Kind::TK_INVALID)\n";
    out << "    , fOffset(-1)\n";
    out << "    , fLength(-1) {}\n";
    out << "\n";
    out << "    " << token << "(Kind kind, int32_t offset, int32_t length)\n";
    out << "    : fKind(kind)\n";
    out << "    , fOffset(offset)\n";
    out << "    , fLength(length) {}\n";
    out << "\n";
    out << "    Kind fKind;\n";
    out << "    int32_t fOffset;\n";
    out << "    int32_t fLength;\n";
    out << "};\n";
    out << "\n";
    out << "class " << lexer << " {\n";
    out << "public:\n";
    out << "    void start(const char* text, int32_t length) {\n";
    out << "        fText = text;\n";
    out << "        fLength = length;\n";
    out << "        fOffset = 0;\n";
    out << "    }\n";
    out << "\n";
    out << "    " << token << " next();\n";
    out << "\n";
    out << "private:\n";
    out << "    const char* fText;\n";
    out << "    int32_t fLength;\n";
    out << "    int32_t fOffset;\n";
    out << "};\n";
    out << "\n";
    out << "} // namespace\n";
    out << "#endif\n";
}

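/**
 * Generates the lexer's implementation file. The generated .cpp contains three tables: 'mappings'
 * (input byte to character class), 'transitions' (character class and current state to next state,
 * where 0 means "no transition"), and 'accepts' (state to token kind, or INVALID for states which
 * do not accept), plus the next() method which walks those tables.
 */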
void writeCPP(const DFA& dfa, const char* lexer, const char* token, const char* include,
              const char* cppPath) {
    std::ofstream out(cppPath);
    SkASSERT(out.good());
    out << HEADER;
    out << "#include \"" << include << "\"\n";
    out << "\n";
    out << "namespace SkSL {\n";
    out << "\n";

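    // Every row of the transition table is emitted with the same width, so first determine the
    // widest row (the number of DFA states); shorter rows are padded with zeros below.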
    size_t states = 0;
    for (const auto& row : dfa.fTransitions) {
        states = std::max(states, row.size());
    }
    // arbitrarily-chosen character which is greater than START_CHAR and should not appear in
    // actual input
    out << "static const uint8_t INVALID_CHAR = 18;\n";
    out << "static int8_t mappings[" << dfa.fCharMappings.size() << "] = {\n    ";
    const char* separator = "";
    for (int m : dfa.fCharMappings) {
        out << separator << std::to_string(m);
        separator = ", ";
    }
    out << "\n};\n";
    out << "static int16_t transitions[" << dfa.fTransitions.size() << "][" << states << "] = {\n";
    for (size_t c = 0; c < dfa.fTransitions.size(); ++c) {
        out << "    {";
        for (size_t j = 0; j < states; ++j) {
            if (j < dfa.fTransitions[c].size()) {
                out << " " << dfa.fTransitions[c][j] << ",";
            } else {
                out << " 0,";
            }
        }
        out << " },\n";
    }
    out << "};\n";
    out << "\n";

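    // accepts[i] holds the token kind recognized when the scan stops in state i, or INVALID for
    // states which do not correspond to a complete token.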
    out << "static int8_t accepts[" << states << "] = {";
    for (size_t i = 0; i < states; ++i) {
        if (i < dfa.fAccepts.size()) {
            out << " " << dfa.fAccepts[i] << ",";
        } else {
            out << " " << INVALID << ",";
        }
    }
    out << " };\n";
    out << "\n";

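    // The generated next() maps each input byte to a character class via 'mappings', follows
    // 'transitions' until it reaches a dead state (0) or the end of the input, and then looks up
    // the matched token kind in 'accepts'.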
    out << token << " " << lexer << "::next() {\n";
    out << "    // note that we cheat here: normally a lexer needs to worry about the case\n";
    out << "    // where a token has a prefix which is not itself a valid token - for instance, \n";
    out << "    // maybe we have a valid token 'while', but 'w', 'wh', etc. are not valid\n";
    out << "    // tokens. Our grammar doesn't have this property, so we can simplify the logic\n";
    out << "    // a bit.\n";
    out << "    int32_t startOffset = fOffset;\n";
    out << "    if (startOffset == fLength) {\n";
    out << "        return " << token << "(" << token << "::Kind::TK_END_OF_FILE, startOffset,"
           "0);\n";
    out << "    }\n";
    out << "    int16_t state = 1;\n";
    out << "    for (;;) {\n";
    out << "        if (fOffset >= fLength) {\n";
    out << "            if (accepts[state] == -1) {\n";
    out << "                return " << token << "(" << token << "::Kind::TK_END_OF_FILE,"
           " startOffset, 0);\n";
    out << "            }\n";
    out << "            break;\n";
    out << "        }\n";
    out << "        uint8_t c = (uint8_t) fText[fOffset];\n";
    out << "        if (c <= 8 || c >= " << dfa.fCharMappings.size() << ") {\n";
    out << "            c = INVALID_CHAR;\n";
    out << "        }\n";
    out << "        int16_t newState = transitions[mappings[c]][state];\n";
    out << "        if (!newState) {\n";
    out << "            break;\n";
    out << "        }\n";
    out << "        state = newState;\n";
    out << "        ++fOffset;\n";
    out << "    }\n";
    out << "    " << token << "::Kind kind = (" << token << "::Kind) accepts[state];\n";
    out << "    return " << token << "(kind, startOffset, fOffset - startOffset);\n";
    out << "}\n";
    out << "\n";
    out << "} // namespace\n";
}

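/**
 * Reads the token definitions from the .lex file at inPath, builds an NFA containing one regex per
 * definition, converts it to a DFA, and writes the generated lexer to hPath and cppPath.
 */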
void process(const char* inPath, const char* lexer, const char* token, const char* hPath,
             const char* cppPath) {
    NFA nfa;
    std::vector<std::string> tokens;
    tokens.push_back("END_OF_FILE");
    std::string line;
    std::ifstream in(inPath);
    while (std::getline(in, line)) {
        if (line.length() == 0) {
            continue;
        }
        if (line.length() >= 2 && line[0] == '/' && line[1] == '/') {
            continue;
        }
        std::istringstream split(line);
        std::string name, delimiter, pattern;
        if (split >> name >> delimiter >> pattern) {
            SkASSERT(split.eof());
            SkASSERT(name != "");
            SkASSERT(delimiter == "=");
            SkASSERT(pattern != "");
            tokens.push_back(name);
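            // A double-quoted literal is expanded into a chain of concatenated single-character
            // nodes; for example, a (hypothetical) definition IF = "if" becomes
            // Concat(Char('i'), Char('f')).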
            if (pattern[0] == '"') {
                SkASSERT(pattern.size() > 2 && pattern[pattern.size() - 1] == '"');
                RegexNode node = RegexNode(RegexNode::kChar_Kind, pattern[1]);
                for (size_t i = 2; i < pattern.size() - 1; ++i) {
                    node = RegexNode(RegexNode::kConcat_Kind, node,
                                     RegexNode(RegexNode::kChar_Kind, pattern[i]));
                }
                nfa.addRegex(node);
            }
            else {
                nfa.addRegex(RegexParser().parse(pattern));
            }
        }
    }
    NFAtoDFA converter(&nfa);
    DFA dfa = converter.convert();
    writeH(dfa, lexer, token, tokens, hPath);
    writeCPP(dfa, lexer, token, (std::string("src/sksl/SkSL") + lexer + ".h").c_str(), cppPath);
}

int main(int argc, const char** argv) {
    if (argc != 6) {
        printf("usage: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>\n");
        exit(1);
    }
    process(argv[1], argv[2], argv[3], argv[4], argv[5]);
    return 0;
}