1 //===-- Lexer.cpp ---------------------------------------------------------===//
2 //
3 //                     The KLEE Symbolic Virtual Machine
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "klee/Expr/Parser/Lexer.h"
11 
12 #include "llvm/Support/MemoryBuffer.h"
13 #include "llvm/Support/raw_ostream.h"
14 
15 #include <iomanip>
16 #include <string.h>
17 
18 using namespace llvm;
19 using namespace klee;
20 using namespace klee::expr;
21 
22 ///
23 
getKindName() const24 const char *Token::getKindName() const {
25   switch (kind) {
26   default:
27   case Unknown:    return "Unknown";
28   case Arrow:      return "Arrow";
29   case At:         return "At";
30   case Colon:      return "Colon";
31   case Comma:      return "Comma";
32   case Comment:    return "Comment";
33   case EndOfFile:  return "EndOfFile";
34   case Equals:     return "Equals";
35   case Identifier: return "Identifier";
36   case KWArray:    return "KWArray";
37   case KWFalse:    return "KWFalse";
38   case KWQuery:    return "KWQuery";
39   case KWReserved: return "KWReserved";
40   case KWSymbolic: return "KWSymbolic";
41   case KWTrue:     return "KWTrue";
42   case KWWidth:    return "KWWidth";
43   case LBrace:     return "LBrace";
44   case LParen:     return "LParen";
45   case LSquare:    return "LSquare";
46   case Number:     return "Number";
47   case RBrace:     return "RBrace";
48   case RParen:     return "RParen";
49   case RSquare:    return "RSquare";
50   case Semicolon:  return "Semicolon";
51   }
52 }
53 
dump()54 void Token::dump() {
55   llvm::errs() << "(Token \"" << getKindName() << "\" "
56                << (const void*) start << " " << length << " "
57                << line << " " << column << ")";
58 }
59 
60 ///
61 
/// Return true if \p Char may appear after the first character of an
/// identifier: an alphanumeric character, '_', '.' or '-'.
///
/// \param Char A character value, or any other int (for example the -1
///             end-of-buffer sentinel). Values that are not representable
///             as unsigned char are never identifier characters.
static inline bool isInternalIdentifierChar(int Char) {
  // The <ctype.h> classifiers are only defined for EOF and for values in
  // the unsigned char range, so reject everything else before calling
  // isalnum(). This also covers the -1 sentinel.
  if (Char != static_cast<unsigned char>(Char))
    return false;

  return isalnum(Char) || Char == '_' || Char == '.' || Char == '-';
}
65 
Lexer(const llvm::MemoryBuffer * MB)66 Lexer::Lexer(const llvm::MemoryBuffer *MB)
67   : BufferPos(MB->getBufferStart()), BufferEnd(MB->getBufferEnd()),
68     LineNumber(1), ColumnNumber(0) {
69 }
70 
~Lexer()71 Lexer::~Lexer() {
72 }
73 
PeekNextChar()74 int Lexer::PeekNextChar() {
75   if (BufferPos == BufferEnd)
76     return -1;
77   return *BufferPos;
78 }
79 
GetNextChar()80 int Lexer::GetNextChar() {
81   if (BufferPos == BufferEnd)
82     return -1;
83 
84   // Handle DOS/Mac newlines here, by stripping duplicates and by
85   // returning '\n' for both.
86   char Result = *BufferPos++;
87   if (Result == '\n' || Result == '\r') {
88     if (BufferPos != BufferEnd && *BufferPos == ('\n' + '\r' - Result))
89       ++BufferPos;
90     Result = '\n';
91   }
92 
93   if (Result == '\n') {
94     ++LineNumber;
95     ColumnNumber = 0;
96   } else {
97     ++ColumnNumber;
98   }
99 
100   return Result;
101 }
102 
SetTokenKind(Token & Result,Token::Kind k)103 Token &Lexer::SetTokenKind(Token &Result, Token::Kind k) {
104   Result.kind = k;
105   Result.length = BufferPos - Result.start;
106   return Result;
107 }
108 
/// Return true if the \p N characters starting at \p Str spell one of the
/// pattern-based reserved words: an integer type name ("i" followed by
/// digits) or a floating-point type name ("fp" followed by digits and an
/// optional '.'-introduced suffix).
///
/// \param Str Start of the spelling; it need not be NUL-terminated.
/// \param N   Number of characters in the spelling.
static bool isReservedKW(const char *Str, unsigned N) {
  unsigned i;

  // Check for i[0-9]+
  if (N > 1 && Str[0] == 'i') {
    // isdigit() is only defined for unsigned char values (and EOF), so
    // convert before classifying.
    for (i = 1; i < N; ++i)
      if (!isdigit(static_cast<unsigned char>(Str[i])))
        break;
    if (i == N)
      return true;
  }

  // Check for fp[0-9]+([.].*)?$
  //
  // NOTE(review): the N > 3 guard means a three-character spelling such as
  // "fp8" is NOT treated as reserved, although the pattern above would
  // match it. Behaviour is kept as-is; confirm whether that is intended.
  if (N > 3 && Str[0] == 'f' && Str[1] == 'p' &&
      isdigit(static_cast<unsigned char>(Str[2]))) {
    for (i = 3; i < N; ++i)
      if (!isdigit(static_cast<unsigned char>(Str[i])))
        break;
    // Either the whole tail was digits, or the digits are followed by a
    // '.' suffix whose contents are not inspected.
    if (i == N || Str[i] == '.')
      return true;
  }

  return false;
}
/// Return true if the \p N characters starting at \p Str spell a width
/// keyword: the letter 'w' followed by one or more decimal digits.
///
/// \param Str Start of the spelling; it need not be NUL-terminated.
/// \param N   Number of characters in the spelling.
static bool isWidthKW(const char *Str, unsigned N) {
  // Need the 'w' plus at least one digit.
  if (N < 2 || Str[0] != 'w')
    return false;

  for (unsigned i = 1; i < N; ++i) {
    // isdigit() is only defined for unsigned char values (and EOF), so
    // convert before classifying.
    if (!isdigit(static_cast<unsigned char>(Str[i])))
      return false;
  }
  return true;
}
SetIdentifierTokenKind(Token & Result)140 Token &Lexer::SetIdentifierTokenKind(Token &Result) {
141   unsigned Length = BufferPos - Result.start;
142   switch (Length) {
143   case 3:
144     if (memcmp("def", Result.start, 3) == 0)
145       return SetTokenKind(Result, Token::KWReserved);
146     if (memcmp("var", Result.start, 3) == 0)
147       return SetTokenKind(Result, Token::KWReserved);
148     break;
149 
150   case 4:
151     if (memcmp("true", Result.start, 4) == 0)
152       return SetTokenKind(Result, Token::KWTrue);
153     break;
154 
155   case 5:
156     if (memcmp("array", Result.start, 5) == 0)
157       return SetTokenKind(Result, Token::KWArray);
158     if (memcmp("false", Result.start, 5) == 0)
159       return SetTokenKind(Result, Token::KWFalse);
160     if (memcmp("query", Result.start, 5) == 0)
161       return SetTokenKind(Result, Token::KWQuery);
162     break;
163 
164   case 6:
165     if (memcmp("define", Result.start, 6) == 0)
166       return SetTokenKind(Result, Token::KWReserved);
167     break;
168 
169   case 7:
170     if (memcmp("declare", Result.start, 7) == 0)
171       return SetTokenKind(Result, Token::KWReserved);
172     break;
173 
174   case 8:
175     if (memcmp("symbolic", Result.start, 8) == 0)
176       return SetTokenKind(Result, Token::KWSymbolic);
177     break;
178   }
179 
180   if (isReservedKW(Result.start, Length))
181     return SetTokenKind(Result, Token::KWReserved);
182   if (isWidthKW(Result.start, Length))
183     return SetTokenKind(Result, Token::KWWidth);
184 
185   return SetTokenKind(Result, Token::Identifier);
186 }
187 
SkipToEndOfLine()188 void Lexer::SkipToEndOfLine() {
189   for (;;) {
190     int Char = GetNextChar();
191     if (Char == -1 || Char =='\n')
192       break;
193   }
194 }
195 
LexNumber(Token & Result)196 Token &Lexer::LexNumber(Token &Result) {
197   while (isalnum(PeekNextChar()) || PeekNextChar()=='_')
198     GetNextChar();
199   return SetTokenKind(Result, Token::Number);
200 }
201 
LexIdentifier(Token & Result)202 Token &Lexer::LexIdentifier(Token &Result) {
203   while (isInternalIdentifierChar(PeekNextChar()))
204     GetNextChar();
205 
206   // Recognize keywords specially.
207   return SetIdentifierTokenKind(Result);
208 }
209 
Lex(Token & Result)210 Token &Lexer::Lex(Token &Result) {
211   Result.kind = Token::Unknown;
212   Result.length = 0;
213   Result.start = BufferPos;
214 
215   // Skip whitespace.
216   while (isspace(PeekNextChar()))
217     GetNextChar();
218 
219   Result.start = BufferPos;
220   Result.line = LineNumber;
221   Result.column = ColumnNumber;
222   int Char = GetNextChar();
223   switch (Char) {
224   case -1:  return SetTokenKind(Result, Token::EndOfFile);
225 
226   case '(': return SetTokenKind(Result, Token::LParen);
227   case ')': return SetTokenKind(Result, Token::RParen);
228   case ',': return SetTokenKind(Result, Token::Comma);
229   case ':': return SetTokenKind(Result, Token::Colon);
230   case ';': return SetTokenKind(Result, Token::Semicolon);
231   case '=': return SetTokenKind(Result, Token::Equals);
232   case '@': return SetTokenKind(Result, Token::At);
233   case '[': return SetTokenKind(Result, Token::LSquare);
234   case ']': return SetTokenKind(Result, Token::RSquare);
235   case '{': return SetTokenKind(Result, Token::LBrace);
236   case '}': return SetTokenKind(Result, Token::RBrace);
237 
238   case '#':
239     SkipToEndOfLine();
240     return SetTokenKind(Result, Token::Comment);
241 
242   case '+': {
243     if (isdigit(PeekNextChar()))
244       return LexNumber(Result);
245     else
246       return SetTokenKind(Result, Token::Unknown);
247   }
248 
249   case '-': {
250     int Next = PeekNextChar();
251     if (Next == '>')
252       return GetNextChar(), SetTokenKind(Result, Token::Arrow);
253     else if (isdigit(Next))
254       return LexNumber(Result);
255     else
256       return SetTokenKind(Result, Token::Unknown);
257     break;
258   }
259 
260   default:
261     if (isdigit(Char))
262       return LexNumber(Result);
263     else if (isalpha(Char) || Char == '_')
264       return LexIdentifier(Result);
265     return SetTokenKind(Result, Token::Unknown);
266   }
267 }
268