1 // tokenizer.cpp
2 //
3 // Copyright (C) 2001 Chris Laurel <claurel@shatters.net>
4 //
5 // This program is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU General Public License
7 // as published by the Free Software Foundation; either version 2
8 // of the License, or (at your option) any later version.
9 
10 #include <cctype>
11 #include <cmath>
12 #include <iomanip>
13 #include <celutil/utf8.h>
14 #include "tokenizer.h"
15 
16 
// Return true when c terminates a numeric literal, i.e. when it is neither a
// decimal digit, nor a letter, nor a '.'.
//
// The <cctype> classification functions require an argument that is either
// EOF or representable as unsigned char; handing them a negative plain char
// (any byte >= 0x80 where char is signed, such as a UTF-8 continuation byte)
// is undefined behavior. The value is therefore converted through
// unsigned char before it is classified.
static bool issep(char c)
{
    const unsigned char uc = static_cast<unsigned char>(c);
    return !std::isdigit(uc) && !std::isalpha(uc) && c != '.';
}
21 
22 
Tokenizer(istream * _in)23 Tokenizer::Tokenizer(istream* _in) :
24     in(_in),
25     tokenType(TokenBegin),
26     haveValidNumber(false),
27     haveValidName(false),
28     haveValidString(false),
29     pushedBack(false),
30     lineNum(1)
31 {
32 }
33 
34 
nextToken()35 Tokenizer::TokenType Tokenizer::nextToken()
36 {
37     State state = StartState;
38 
39     if (pushedBack)
40     {
41         pushedBack = false;
42         return tokenType;
43     }
44 
45     textToken = "";
46     haveValidNumber = false;
47     haveValidName = false;
48     haveValidString = false;
49 
50     if (tokenType == TokenBegin)
51     {
52         nextChar = readChar();
53         if (in->eof())
54             return TokenEnd;
55     }
56     else if (tokenType == TokenEnd)
57     {
58         return tokenType;
59     }
60 
61     double integerValue = 0;
62     double fractionValue = 0;
63     double sign = 1;
64     double fracExp = 1;
65     double exponentValue = 0;
66     double exponentSign = 1;
67 
68     TokenType newToken = TokenBegin;
69     while (newToken == TokenBegin)
70     {
71         switch (state)
72         {
73         case StartState:
74             if (isspace(nextChar))
75             {
76                 state = StartState;
77             }
78             else if (isdigit(nextChar))
79             {
80                 state = NumberState;
81                 integerValue = (int) nextChar - (int) '0';
82             }
83             else if (nextChar == '-')
84             {
85                 state = NumberState;
86                 sign = -1;
87                 integerValue = 0;
88             }
89             else if (nextChar == '+')
90             {
91                 state = NumberState;
92                 sign = +1;
93                 integerValue = 0;
94             }
95             else if (nextChar == '.')
96             {
97                 state = FractionState;
98                 sign = +1;
99                 integerValue = 0;
100             }
101             else if (isalpha(nextChar) || nextChar == '_')
102             {
103                 state = NameState;
104                 textToken += (char) nextChar;
105             }
106             else if (nextChar == '#')
107             {
108                 state = CommentState;
109             }
110             else if (nextChar == '"')
111             {
112                 state = StringState;
113             }
114             else if (nextChar == '{')
115             {
116                 newToken = TokenBeginGroup;
117                 nextChar = readChar();
118             }
119             else if (nextChar == '}')
120             {
121                 newToken = TokenEndGroup;
122                 nextChar = readChar();
123             }
124             else if (nextChar == '[')
125             {
126                 newToken = TokenBeginArray;
127                 nextChar = readChar();
128             }
129             else if (nextChar == ']')
130             {
131                 newToken = TokenEndArray;
132                 nextChar = readChar();
133             }
134             else if (nextChar == '=')
135             {
136                 newToken = TokenEquals;
137                 nextChar = readChar();
138             }
139             else if (nextChar == '|')
140             {
141                 newToken = TokenBar;
142                 nextChar = readChar();
143             }
144             else if (nextChar == -1)
145             {
146                 newToken = TokenEnd;
147             }
148             else
149             {
150                 newToken = TokenError;
151                 syntaxError("Bad character in stream");
152             }
153             break;
154 
155         case NameState:
156             if (isalpha(nextChar) || isdigit(nextChar) || nextChar == '_')
157             {
158                 state = NameState;
159                 textToken += (char) nextChar;
160             }
161             else
162             {
163                 newToken = TokenName;
164                 haveValidName = true;
165             }
166             break;
167 
168         case CommentState:
169             if (nextChar == '\n' || nextChar == '\r' || nextChar == char_traits<char>::eof())
170                 state = StartState;
171             break;
172 
173         case StringState:
174             if (nextChar == '"')
175             {
176                 newToken = TokenString;
177                 haveValidString = true;
178                 nextChar = readChar();
179             }
180             else if (nextChar == '\\')
181             {
182                 state = StringEscapeState;
183             }
184             else if (nextChar == char_traits<char>::eof())
185             {
186                 newToken = TokenError;
187                 syntaxError("Unterminated string");
188             }
189             else
190             {
191                 state = StringState;
192                 textToken += (char) nextChar;
193             }
194             break;
195 
196         case StringEscapeState:
197             if (nextChar == '\\')
198             {
199                 textToken += '\\';
200                 state = StringState;
201             }
202             else if (nextChar == 'n')
203             {
204                 textToken += '\n';
205                 state = StringState;
206             }
207             else if (nextChar == '"')
208             {
209                 textToken += '"';
210                 state = StringState;
211             }
212             else if (nextChar == 'u')
213             {
214                 unicodeValue = 0;
215                 unicodeEscapeDigits = 0;
216                 state = UnicodeEscapeState;
217             }
218             else
219             {
220                 newToken = TokenError;
221                 syntaxError("Unknown escape code in string");
222                 state = StringState;
223             }
224             break;
225 
226         case NumberState:
227             if (isdigit(nextChar))
228             {
229                 state = NumberState;
230                 integerValue = integerValue * 10 + (int) nextChar - (int) '0';
231             }
232             else if (nextChar == '.')
233             {
234                 state = FractionState;
235             }
236             else if (nextChar == 'e' || nextChar == 'E')
237             {
238                 state = ExponentFirstState;
239             }
240             else if (issep(nextChar))
241             {
242                 newToken = TokenNumber;
243                 haveValidNumber = true;
244             }
245             else
246             {
247                 newToken = TokenError;
248                 syntaxError("Bad character in number");
249             }
250             break;
251 
252         case FractionState:
253             if (isdigit(nextChar))
254             {
255                 state = FractionState;
256                 fractionValue = fractionValue * 10 + nextChar - (int) '0';
257                 fracExp *= 10;
258             }
259             else if (nextChar == 'e' || nextChar == 'E')
260             {
261                 state = ExponentFirstState;
262             }
263             else if (issep(nextChar))
264             {
265                 newToken = TokenNumber;
266                 haveValidNumber = true;
267             } else {
268                 newToken = TokenError;
269                 syntaxError("Bad character in number");
270             }
271             break;
272 
273         case ExponentFirstState:
274             if (isdigit(nextChar))
275             {
276                 state = ExponentState;
277                 exponentValue = (int) nextChar - (int) '0';
278             }
279             else if (nextChar == '-')
280             {
281                 state = ExponentState;
282                 exponentSign = -1;
283             }
284             else if (nextChar == '+')
285             {
286                 state = ExponentState;
287             }
288             else
289             {
290                 state = ErrorState;
291                 syntaxError("Bad character in number");
292             }
293             break;
294 
295         case ExponentState:
296             if (isdigit(nextChar))
297             {
298                 state = ExponentState;
299                 exponentValue = exponentValue * 10 + (int) nextChar - (int) '0';
300             }
301             else if (issep(nextChar))
302             {
303                 newToken = TokenNumber;
304                 haveValidNumber = true;
305             }
306             else
307             {
308                 state = ErrorState;
309                 syntaxError("Bad character in number");
310             }
311             break;
312 
313         case DotState:
314             if (isdigit(nextChar))
315             {
316                 state = FractionState;
317                 fractionValue = fractionValue * 10 + (int) nextChar - (int) '0';
318                 fracExp = 10;
319             }
320             else
321             {
322                 state = ErrorState;
323                 syntaxError("'.' in stupid place");
324             }
325             break;
326 
327         case UnicodeEscapeState:
328             if (isxdigit(nextChar))
329             {
330                 unsigned int digitValue;
331                 if (nextChar >= 'a' && nextChar <= 'f')
332                     digitValue = nextChar - 'a' + 10;
333                 else if (nextChar >= 'A' && nextChar <= 'F')
334                     digitValue = nextChar - 'A' + 10;
335                 else
336                     digitValue = nextChar - '0';
337                 unicodeValue = (unicodeValue << 4) + digitValue;
338                 unicodeEscapeDigits++;
339                 if (unicodeEscapeDigits == 4)
340                 {
341                     char utf8Encoded[7];
342                     UTF8Encode((wchar_t) unicodeValue, utf8Encoded);
343                     textToken += utf8Encoded;
344                     state = StringState;
345                 }
346             }
347             else
348             {
349                 state = ErrorState;
350                 syntaxError("Bad Unicode escape in string");
351             }
352             break;
353 
354         case ErrorState:    break;  // Prevent GCC4 warnings; do nothing
355 
356         } // Switch
357 
358         if (newToken == TokenBegin)
359         {
360             nextChar = readChar();
361         }
362     }
363 
364     tokenType = newToken;
365     if (haveValidNumber)
366     {
367         numberValue = integerValue + fractionValue / fracExp;
368         if (exponentValue != 0)
369             numberValue *= pow(10.0, exponentValue * exponentSign);
370         numberValue *= sign;
371     }
372 
373     return tokenType;
374 }
375 
376 
getTokenType()377 Tokenizer::TokenType Tokenizer::getTokenType()
378 {
379     return tokenType;
380 }
381 
382 
pushBack()383 void Tokenizer::pushBack()
384 {
385     pushedBack = true;
386 }
387 
388 
getNumberValue()389 double Tokenizer::getNumberValue()
390 {
391     return numberValue;
392 }
393 
394 
getNameValue()395 string Tokenizer::getNameValue()
396 {
397     return textToken;
398 }
399 
400 
getStringValue()401 string Tokenizer::getStringValue()
402 {
403     return textToken;
404 }
405 
406 
readChar()407 int Tokenizer::readChar()
408 {
409     int c = (int) in->get();
410     if (c == '\n')
411         lineNum++;
412 
413     return c;
414 }
415 
syntaxError(const char * message)416 void Tokenizer::syntaxError(const char* message)
417 {
418     cerr << message << '\n';
419 }
420 
421 
getLineNumber() const422 int Tokenizer::getLineNumber() const
423 {
424     return lineNum;
425 }
426 
#if 0
// Tokenizer test
//
// Disabled manual test driver: tokenizes standard input and prints one line
// per token until TokenEnd has been reported. Token types without a
// dedicated case are printed as "Other".
int main(int argc, char *argv[])
{
    Tokenizer tokenizer(&cin);
    Tokenizer::TokenType tok = Tokenizer::TokenBegin;

    // tok starts out different from TokenEnd, so the body always runs at
    // least once; the loop stops right after the TokenEnd line is printed.
    do
    {
        tok = tokenizer.nextToken();

        switch (tok)
        {
        case Tokenizer::TokenBegin:      cout << "Begin"; break;
        case Tokenizer::TokenEnd:        cout << "End";   break;
        case Tokenizer::TokenName:
            cout << "Name = " << tokenizer.getNameValue();
            break;
        case Tokenizer::TokenNumber:
            cout << "Number = " << tokenizer.getNumberValue();
            break;
        case Tokenizer::TokenString:
            cout << "String = " << '"' << tokenizer.getStringValue() << '"';
            break;
        case Tokenizer::TokenBeginGroup: cout << '{';     break;
        case Tokenizer::TokenEndGroup:   cout << '}';     break;
        case Tokenizer::TokenEquals:     cout << '=';     break;
        default:                         cout << "Other"; break;
        }

        cout << '\n';
    }
    while (tok != Tokenizer::TokenEnd);

    return 0;
}
#endif
474