1 // tokenizer.cpp
2 //
3 // Copyright (C) 2001 Chris Laurel <claurel@shatters.net>
4 //
5 // This program is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU General Public License
7 // as published by the Free Software Foundation; either version 2
8 // of the License, or (at your option) any later version.
9
10 #include <cctype>
11 #include <cmath>
12 #include <iomanip>
13 #include <celutil/utf8.h>
14 #include "tokenizer.h"
15
16
// Return true if c may legally terminate a numeric literal, i.e. it is
// neither a decimal digit, nor a letter, nor a '.'.
//
// The character is converted to unsigned char before being handed to the
// <cctype> classifiers: passing a negative value other than EOF to them is
// undefined behaviour, and plain char is signed on many platforms, so any
// byte >= 0x80 (e.g. part of a UTF-8 sequence) would otherwise be unsafe.
static bool issep(char c)
{
    const unsigned char uc = static_cast<unsigned char>(c);
    return !isdigit(uc) && !isalpha(uc) && c != '.';
}
21
22
Tokenizer(istream * _in)23 Tokenizer::Tokenizer(istream* _in) :
24 in(_in),
25 tokenType(TokenBegin),
26 haveValidNumber(false),
27 haveValidName(false),
28 haveValidString(false),
29 pushedBack(false),
30 lineNum(1)
31 {
32 }
33
34
nextToken()35 Tokenizer::TokenType Tokenizer::nextToken()
36 {
37 State state = StartState;
38
39 if (pushedBack)
40 {
41 pushedBack = false;
42 return tokenType;
43 }
44
45 textToken = "";
46 haveValidNumber = false;
47 haveValidName = false;
48 haveValidString = false;
49
50 if (tokenType == TokenBegin)
51 {
52 nextChar = readChar();
53 if (in->eof())
54 return TokenEnd;
55 }
56 else if (tokenType == TokenEnd)
57 {
58 return tokenType;
59 }
60
61 double integerValue = 0;
62 double fractionValue = 0;
63 double sign = 1;
64 double fracExp = 1;
65 double exponentValue = 0;
66 double exponentSign = 1;
67
68 TokenType newToken = TokenBegin;
69 while (newToken == TokenBegin)
70 {
71 switch (state)
72 {
73 case StartState:
74 if (isspace(nextChar))
75 {
76 state = StartState;
77 }
78 else if (isdigit(nextChar))
79 {
80 state = NumberState;
81 integerValue = (int) nextChar - (int) '0';
82 }
83 else if (nextChar == '-')
84 {
85 state = NumberState;
86 sign = -1;
87 integerValue = 0;
88 }
89 else if (nextChar == '+')
90 {
91 state = NumberState;
92 sign = +1;
93 integerValue = 0;
94 }
95 else if (nextChar == '.')
96 {
97 state = FractionState;
98 sign = +1;
99 integerValue = 0;
100 }
101 else if (isalpha(nextChar) || nextChar == '_')
102 {
103 state = NameState;
104 textToken += (char) nextChar;
105 }
106 else if (nextChar == '#')
107 {
108 state = CommentState;
109 }
110 else if (nextChar == '"')
111 {
112 state = StringState;
113 }
114 else if (nextChar == '{')
115 {
116 newToken = TokenBeginGroup;
117 nextChar = readChar();
118 }
119 else if (nextChar == '}')
120 {
121 newToken = TokenEndGroup;
122 nextChar = readChar();
123 }
124 else if (nextChar == '[')
125 {
126 newToken = TokenBeginArray;
127 nextChar = readChar();
128 }
129 else if (nextChar == ']')
130 {
131 newToken = TokenEndArray;
132 nextChar = readChar();
133 }
134 else if (nextChar == '=')
135 {
136 newToken = TokenEquals;
137 nextChar = readChar();
138 }
139 else if (nextChar == '|')
140 {
141 newToken = TokenBar;
142 nextChar = readChar();
143 }
144 else if (nextChar == -1)
145 {
146 newToken = TokenEnd;
147 }
148 else
149 {
150 newToken = TokenError;
151 syntaxError("Bad character in stream");
152 }
153 break;
154
155 case NameState:
156 if (isalpha(nextChar) || isdigit(nextChar) || nextChar == '_')
157 {
158 state = NameState;
159 textToken += (char) nextChar;
160 }
161 else
162 {
163 newToken = TokenName;
164 haveValidName = true;
165 }
166 break;
167
168 case CommentState:
169 if (nextChar == '\n' || nextChar == '\r' || nextChar == char_traits<char>::eof())
170 state = StartState;
171 break;
172
173 case StringState:
174 if (nextChar == '"')
175 {
176 newToken = TokenString;
177 haveValidString = true;
178 nextChar = readChar();
179 }
180 else if (nextChar == '\\')
181 {
182 state = StringEscapeState;
183 }
184 else if (nextChar == char_traits<char>::eof())
185 {
186 newToken = TokenError;
187 syntaxError("Unterminated string");
188 }
189 else
190 {
191 state = StringState;
192 textToken += (char) nextChar;
193 }
194 break;
195
196 case StringEscapeState:
197 if (nextChar == '\\')
198 {
199 textToken += '\\';
200 state = StringState;
201 }
202 else if (nextChar == 'n')
203 {
204 textToken += '\n';
205 state = StringState;
206 }
207 else if (nextChar == '"')
208 {
209 textToken += '"';
210 state = StringState;
211 }
212 else if (nextChar == 'u')
213 {
214 unicodeValue = 0;
215 unicodeEscapeDigits = 0;
216 state = UnicodeEscapeState;
217 }
218 else
219 {
220 newToken = TokenError;
221 syntaxError("Unknown escape code in string");
222 state = StringState;
223 }
224 break;
225
226 case NumberState:
227 if (isdigit(nextChar))
228 {
229 state = NumberState;
230 integerValue = integerValue * 10 + (int) nextChar - (int) '0';
231 }
232 else if (nextChar == '.')
233 {
234 state = FractionState;
235 }
236 else if (nextChar == 'e' || nextChar == 'E')
237 {
238 state = ExponentFirstState;
239 }
240 else if (issep(nextChar))
241 {
242 newToken = TokenNumber;
243 haveValidNumber = true;
244 }
245 else
246 {
247 newToken = TokenError;
248 syntaxError("Bad character in number");
249 }
250 break;
251
252 case FractionState:
253 if (isdigit(nextChar))
254 {
255 state = FractionState;
256 fractionValue = fractionValue * 10 + nextChar - (int) '0';
257 fracExp *= 10;
258 }
259 else if (nextChar == 'e' || nextChar == 'E')
260 {
261 state = ExponentFirstState;
262 }
263 else if (issep(nextChar))
264 {
265 newToken = TokenNumber;
266 haveValidNumber = true;
267 } else {
268 newToken = TokenError;
269 syntaxError("Bad character in number");
270 }
271 break;
272
273 case ExponentFirstState:
274 if (isdigit(nextChar))
275 {
276 state = ExponentState;
277 exponentValue = (int) nextChar - (int) '0';
278 }
279 else if (nextChar == '-')
280 {
281 state = ExponentState;
282 exponentSign = -1;
283 }
284 else if (nextChar == '+')
285 {
286 state = ExponentState;
287 }
288 else
289 {
290 state = ErrorState;
291 syntaxError("Bad character in number");
292 }
293 break;
294
295 case ExponentState:
296 if (isdigit(nextChar))
297 {
298 state = ExponentState;
299 exponentValue = exponentValue * 10 + (int) nextChar - (int) '0';
300 }
301 else if (issep(nextChar))
302 {
303 newToken = TokenNumber;
304 haveValidNumber = true;
305 }
306 else
307 {
308 state = ErrorState;
309 syntaxError("Bad character in number");
310 }
311 break;
312
313 case DotState:
314 if (isdigit(nextChar))
315 {
316 state = FractionState;
317 fractionValue = fractionValue * 10 + (int) nextChar - (int) '0';
318 fracExp = 10;
319 }
320 else
321 {
322 state = ErrorState;
323 syntaxError("'.' in stupid place");
324 }
325 break;
326
327 case UnicodeEscapeState:
328 if (isxdigit(nextChar))
329 {
330 unsigned int digitValue;
331 if (nextChar >= 'a' && nextChar <= 'f')
332 digitValue = nextChar - 'a' + 10;
333 else if (nextChar >= 'A' && nextChar <= 'F')
334 digitValue = nextChar - 'A' + 10;
335 else
336 digitValue = nextChar - '0';
337 unicodeValue = (unicodeValue << 4) + digitValue;
338 unicodeEscapeDigits++;
339 if (unicodeEscapeDigits == 4)
340 {
341 char utf8Encoded[7];
342 UTF8Encode((wchar_t) unicodeValue, utf8Encoded);
343 textToken += utf8Encoded;
344 state = StringState;
345 }
346 }
347 else
348 {
349 state = ErrorState;
350 syntaxError("Bad Unicode escape in string");
351 }
352 break;
353
354 case ErrorState: break; // Prevent GCC4 warnings; do nothing
355
356 } // Switch
357
358 if (newToken == TokenBegin)
359 {
360 nextChar = readChar();
361 }
362 }
363
364 tokenType = newToken;
365 if (haveValidNumber)
366 {
367 numberValue = integerValue + fractionValue / fracExp;
368 if (exponentValue != 0)
369 numberValue *= pow(10.0, exponentValue * exponentSign);
370 numberValue *= sign;
371 }
372
373 return tokenType;
374 }
375
376
getTokenType()377 Tokenizer::TokenType Tokenizer::getTokenType()
378 {
379 return tokenType;
380 }
381
382
pushBack()383 void Tokenizer::pushBack()
384 {
385 pushedBack = true;
386 }
387
388
getNumberValue()389 double Tokenizer::getNumberValue()
390 {
391 return numberValue;
392 }
393
394
getNameValue()395 string Tokenizer::getNameValue()
396 {
397 return textToken;
398 }
399
400
getStringValue()401 string Tokenizer::getStringValue()
402 {
403 return textToken;
404 }
405
406
readChar()407 int Tokenizer::readChar()
408 {
409 int c = (int) in->get();
410 if (c == '\n')
411 lineNum++;
412
413 return c;
414 }
415
syntaxError(const char * message)416 void Tokenizer::syntaxError(const char* message)
417 {
418 cerr << message << '\n';
419 }
420
421
getLineNumber() const422 int Tokenizer::getLineNumber() const
423 {
424 return lineNum;
425 }
426
#if 0
// Tokenizer test
//
// Disabled stand-alone driver: tokenizes standard input and prints one line
// per token. Change the "#if 0" to "#if 1" to build it.
int main(int argc, char *argv[])
{
    // Command-line arguments are not used.
    (void) argc;
    (void) argv;

    Tokenizer tokenizer(&cin);
    Tokenizer::TokenType tok = Tokenizer::TokenBegin;

    // Stop at the end of input and also at the first error, rather than
    // attempting to recover and continue after a malformed token.
    while (tok != Tokenizer::TokenEnd && tok != Tokenizer::TokenError)
    {
        tok = tokenizer.nextToken();
        switch (tok)
        {
        case Tokenizer::TokenBegin:
            cout << "Begin";
            break;
        case Tokenizer::TokenEnd:
            cout << "End";
            break;
        case Tokenizer::TokenName:
            cout << "Name = " << tokenizer.getNameValue();
            break;
        case Tokenizer::TokenNumber:
            cout << "Number = " << tokenizer.getNumberValue();
            break;
        case Tokenizer::TokenString:
            cout << "String = " << '"' << tokenizer.getStringValue() << '"';
            break;
        case Tokenizer::TokenBeginGroup:
            cout << '{';
            break;
        case Tokenizer::TokenEndGroup:
            cout << '}';
            break;
        case Tokenizer::TokenBeginArray:
            cout << '[';
            break;
        case Tokenizer::TokenEndArray:
            cout << ']';
            break;
        case Tokenizer::TokenEquals:
            cout << '=';
            break;
        case Tokenizer::TokenBar:
            cout << '|';
            break;
        case Tokenizer::TokenError:
            cout << "Error (line " << tokenizer.getLineNumber() << ')';
            break;
        default:
            cout << "Other";
            break;
        }

        cout << '\n';
    }

    return 0;
}
#endif
474