1 //===- Lexer.h - Lexer for the Toy language -------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a simple Lexer for the Toy language.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef MLIR_TUTORIAL_TOY_LEXER_H_
14 #define MLIR_TUTORIAL_TOY_LEXER_H_
15 
#include "llvm/ADT/StringRef.h"

#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
20 
21 namespace toy {
22 
/// Describes a position in a source file: file name, line and column.
struct Location {
  /// Shared pointer to the name of the file.
  std::shared_ptr<std::string> file;

  /// Line number.
  int line;

  /// Column number.
  int col;
};
29 
/// Kinds of token produced by the lexer. Single-character punctuation uses its
/// own character code as the enumerator value; every other token kind is
/// assigned a negative value.
enum Token : int {
  // End of input.
  tok_eof = -1,

  // Keywords ("commands").
  tok_return = -2,
  tok_var = -3,
  tok_def = -4,

  // Primary expressions.
  tok_identifier = -5,
  tok_number = -6,

  // Punctuation, encoded as the character itself.
  tok_semicolon = ';',
  tok_parenthese_open = '(',
  tok_parenthese_close = ')',
  tok_bracket_open = '{',
  tok_bracket_close = '}',
  tok_sbracket_open = '[',
  tok_sbracket_close = ']',
};
51 
52 /// The Lexer is an abstract base class providing all the facilities that the
53 /// Parser expects. It goes through the stream one token at a time and keeps
54 /// track of the location in the file for debugging purpose.
55 /// It relies on a subclass to provide a `readNextLine()` method. The subclass
56 /// can proceed by reading the next line from the standard input or from a
57 /// memory mapped file.
58 class Lexer {
59 public:
60   /// Create a lexer for the given filename. The filename is kept only for
61   /// debugging purpose (attaching a location to a Token).
Lexer(std::string filename)62   Lexer(std::string filename)
63       : lastLocation(
64             {std::make_shared<std::string>(std::move(filename)), 0, 0}) {}
65   virtual ~Lexer() = default;
66 
67   /// Look at the current token in the stream.
getCurToken()68   Token getCurToken() { return curTok; }
69 
70   /// Move to the next token in the stream and return it.
getNextToken()71   Token getNextToken() { return curTok = getTok(); }
72 
73   /// Move to the next token in the stream, asserting on the current token
74   /// matching the expectation.
consume(Token tok)75   void consume(Token tok) {
76     assert(tok == curTok && "consume Token mismatch expectation");
77     getNextToken();
78   }
79 
80   /// Return the current identifier (prereq: getCurToken() == tok_identifier)
getId()81   llvm::StringRef getId() {
82     assert(curTok == tok_identifier);
83     return identifierStr;
84   }
85 
86   /// Return the current number (prereq: getCurToken() == tok_number)
getValue()87   double getValue() {
88     assert(curTok == tok_number);
89     return numVal;
90   }
91 
92   /// Return the location for the beginning of the current token.
getLastLocation()93   Location getLastLocation() { return lastLocation; }
94 
95   // Return the current line in the file.
getLine()96   int getLine() { return curLineNum; }
97 
98   // Return the current column in the file.
getCol()99   int getCol() { return curCol; }
100 
101 private:
102   /// Delegate to a derived class fetching the next line. Returns an empty
103   /// string to signal end of file (EOF). Lines are expected to always finish
104   /// with "\n"
105   virtual llvm::StringRef readNextLine() = 0;
106 
107   /// Return the next character from the stream. This manages the buffer for the
108   /// current line and request the next line buffer to the derived class as
109   /// needed.
getNextChar()110   int getNextChar() {
111     // The current line buffer should not be empty unless it is the end of file.
112     if (curLineBuffer.empty())
113       return EOF;
114     ++curCol;
115     auto nextchar = curLineBuffer.front();
116     curLineBuffer = curLineBuffer.drop_front();
117     if (curLineBuffer.empty())
118       curLineBuffer = readNextLine();
119     if (nextchar == '\n') {
120       ++curLineNum;
121       curCol = 0;
122     }
123     return nextchar;
124   }
125 
126   ///  Return the next token from standard input.
getTok()127   Token getTok() {
128     // Skip any whitespace.
129     while (isspace(lastChar))
130       lastChar = Token(getNextChar());
131 
132     // Save the current location before reading the token characters.
133     lastLocation.line = curLineNum;
134     lastLocation.col = curCol;
135 
136     // Identifier: [a-zA-Z][a-zA-Z0-9_]*
137     if (isalpha(lastChar)) {
138       identifierStr = (char)lastChar;
139       while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_')
140         identifierStr += (char)lastChar;
141 
142       if (identifierStr == "return")
143         return tok_return;
144       if (identifierStr == "def")
145         return tok_def;
146       if (identifierStr == "var")
147         return tok_var;
148       return tok_identifier;
149     }
150 
151     // Number: [0-9.]+
152     if (isdigit(lastChar) || lastChar == '.') {
153       std::string numStr;
154       do {
155         numStr += lastChar;
156         lastChar = Token(getNextChar());
157       } while (isdigit(lastChar) || lastChar == '.');
158 
159       numVal = strtod(numStr.c_str(), nullptr);
160       return tok_number;
161     }
162 
163     if (lastChar == '#') {
164       // Comment until end of line.
165       do {
166         lastChar = Token(getNextChar());
167       } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r');
168 
169       if (lastChar != EOF)
170         return getTok();
171     }
172 
173     // Check for end of file.  Don't eat the EOF.
174     if (lastChar == EOF)
175       return tok_eof;
176 
177     // Otherwise, just return the character as its ascii value.
178     Token thisChar = Token(lastChar);
179     lastChar = Token(getNextChar());
180     return thisChar;
181   }
182 
183   /// The last token read from the input.
184   Token curTok = tok_eof;
185 
186   /// Location for `curTok`.
187   Location lastLocation;
188 
189   /// If the current Token is an identifier, this string contains the value.
190   std::string identifierStr;
191 
192   /// If the current Token is a number, this contains the value.
193   double numVal = 0;
194 
195   /// The last value returned by getNextChar(). We need to keep it around as we
196   /// always need to read ahead one character to decide when to end a token and
197   /// we can't put it back in the stream after reading from it.
198   Token lastChar = Token(' ');
199 
200   /// Keep track of the current line number in the input stream
201   int curLineNum = 0;
202 
203   /// Keep track of the current column number in the input stream
204   int curCol = 0;
205 
206   /// Buffer supplied by the derived class on calls to `readNextLine()`
207   llvm::StringRef curLineBuffer = "\n";
208 };
209 
210 /// A lexer implementation operating on a buffer in memory.
211 class LexerBuffer final : public Lexer {
212 public:
LexerBuffer(const char * begin,const char * end,std::string filename)213   LexerBuffer(const char *begin, const char *end, std::string filename)
214       : Lexer(std::move(filename)), current(begin), end(end) {}
215 
216 private:
217   /// Provide one line at a time to the Lexer, return an empty string when
218   /// reaching the end of the buffer.
readNextLine()219   llvm::StringRef readNextLine() override {
220     auto *begin = current;
221     while (current <= end && *current && *current != '\n')
222       ++current;
223     if (current <= end && *current)
224       ++current;
225     llvm::StringRef result{begin, static_cast<size_t>(current - begin)};
226     return result;
227   }
228   const char *current, *end;
229 };
230 } // namespace toy
231 
232 #endif // MLIR_TUTORIAL_TOY_LEXER_H_
233