1 #include "CppBase.h"
2 #include "Internal.h"
3
4 // #define LOGNEXT _DBG_
5
6 namespace Upp {
7
8 #ifdef _MSC_VER
9 #pragma inline_depth(255)
10 #pragma optimize("t", on)
11 #endif
12
// Case labels for every character that can start an identifier: ASCII letters,
// underscore and '$'. Written without a trailing colon so that it is used as
// `case_id:` inside a switch.
#define case_id \
	case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':case 'G':case 'H':case 'I': \
	case 'J':case 'K':case 'L':case 'M':case 'N':case 'O':case 'P':case 'Q':case 'R': \
	case 'S':case 'T':case 'U':case 'V':case 'W':case 'X':case 'Y':case 'Z': \
	case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':case 'g':case 'h':case 'i': \
	case 'j':case 'k':case 'l':case 'm':case 'n':case 'o':case 'p':case 'q':case 'r': \
	case 's':case 't':case 'u':case 'v':case 'w':case 'x':case 'y':case 'z': \
	case '_':case '$'
20
// Case labels for the decimal digits 1..9 ('0' is deliberately excluded).
// Written without a trailing colon so that it is used as `case_nonzero_digit:`.
#define case_nonzero_digit \
	case '1':case '2':case '3': \
	case '4':case '5':case '6': \
	case '7':case '8':case '9'
23
24
25 const char *_CppKeyword[] = {
26 #define CPPID(x) #x,
27 #include "keyword.i"
28 #undef CPPID
29 NULL
30 };
31
CppKeyword()32 const char **CppKeyword() { return _CppKeyword; }
33
34
LexSymbolStat()35 LexSymbolStat::LexSymbolStat() :
36 minSymbol(0)
37 {
38 }
39
Reset(int minSymbol,int maxSymbol)40 void LexSymbolStat::Reset(int minSymbol, int maxSymbol)
41 {
42 ASSERT(minSymbol <= maxSymbol);
43 v.Clear();
44 this->minSymbol = minSymbol;
45 v.SetCount(maxSymbol - minSymbol + 1, 0);
46 }
47
IncStat(int symbol)48 void LexSymbolStat::IncStat(int symbol)
49 {
50 int symbolIndex = symbol - minSymbol;
51 if(symbolIndex >= 0 && symbolIndex < v.GetCount())
52 v[symbolIndex]++;
53 }
54
GetStat(int symbol) const55 int LexSymbolStat::GetStat(int symbol) const
56 {
57 int symbolIndex = symbol - minSymbol;
58 return (symbolIndex >= 0 && symbolIndex < v.GetCount()) ?
59 v[symbolIndex] :
60 0;
61 }
62
SumStat(const Vector<int> & symbols) const63 int LexSymbolStat::SumStat(const Vector<int> & symbols) const
64 {
65 int sum = 0;
66 for(int i = 0; i < symbols.GetCount(); i++)
67 sum += GetStat(symbols[i]);
68 return sum;
69 }
70
Merge(const LexSymbolStat & other)71 void LexSymbolStat::Merge(const LexSymbolStat & other)
72 {
73 if(v.GetCount() == 0) {
74 minSymbol = other.minSymbol;
75 v <<= other.v;
76 return;
77 }
78 ASSERT(other.minSymbol == minSymbol && other.v.GetCount() == v.GetCount());
79 for(int i = 0; i < v.GetCount(); i++)
80 v[i] += other.v[i];
81 }
82
83
Lex()84 Lex::Lex()
85 : statsCollected(false)
86 {
87 const char **cppk = CppKeyword();
88 for(int i = 0; cppk[i]; i++)
89 id.Add(cppk[i]);
90 endkey = id.GetCount();
91 braceslevel = body = 0;
92 }
93
Init(const char * s)94 void Lex::Init(const char *s)
95 {
96 ptr = s;
97 }
98
StartStatCollection()99 void Lex::StartStatCollection()
100 {
101 symbolStat.Reset(-200, endkey+256);
102 statsCollected = true;
103 }
104
FinishStatCollection()105 const LexSymbolStat& Lex::FinishStatCollection()
106 {
107 statsCollected = false;
108 return symbolStat;
109 }
110
GetCharacter()111 int Lex::GetCharacter()
112 {
113 if(*ptr == '\0') return t_eof;
114 int c = *ptr++;
115 if(c == '\\') {
116 c = *ptr++;
117 switch(c) {
118 case 'a': return '\a';
119 case 'b': return '\b';
120 case 't': return '\t';
121 case 'v': return '\v';
122 case 'n': return '\n';
123 case 'r': return '\r';
124 case 'f': return '\f';
125 case 'x':
126 c = 0;
127 if(isxdigit(*ptr)) {
128 c = (*ptr >= 'A' ? ToUpper(*ptr) - 'A' + 10 : *ptr - '0');
129 ptr++;
130 if(isxdigit(*ptr)) {
131 c = 16 * c + (*ptr >= 'A' ? ToUpper(*ptr) - 'A' + 10 : *ptr - '0');
132 ptr++;
133 }
134 }
135 break;
136 default:
137 if(c >= '0' && c <= '7') {
138 c -= '0';
139 if(*ptr >= '0' && *ptr <= '7')
140 c = 8 * c + *ptr++ - '0';
141 if(*ptr >= '0' && *ptr <= '7')
142 c = 8 * c + *ptr++ - '0';
143 }
144 }
145 }
146 return (byte)c;
147 }
148
Next()149 void Lex::Next()
150 {
151 grounding = false;
152 while((byte)*ptr <= ' ') {
153 if(*ptr == '\2')
154 grounding = true;
155 if(*ptr == '\0') return;
156 ptr++;
157 }
158 pos = ptr;
159 int c = (byte)*ptr++;
160 if(c == '\0') return;
161 switch(c) {
162 case_id: {
163 const char *b = ptr - 1;
164 while(iscid(*ptr))
165 ptr++;
166 String x(b, ptr);
167 int q = id.FindAdd(x);
168 if(q == tk_rval_ - 256) { // simple hack for old rval macro
169 AddCode('&');
170 AddCode('&');
171 }
172 else
173 AddCode(q + 256);
174 break;
175 }
176 case ':': AddCode(Char(':') ? t_dblcolon : ':'); break;
177 case '*': AssOp('*', t_mulass); break;
178 case '/': AssOp('/', t_divass); break;
179 case '%': AssOp('%', t_modass); break;
180 case '^': AssOp('^', t_xorass); break;
181 case '!': AssOp('!', t_neq); break;
182 case '.':
183 if(Char('*')) AddCode(t_dot_asteriks);
184 else
185 if(*ptr == '.' && ptr[1] == '.') {
186 AddCode(t_elipsis);
187 ptr += 2;
188 }
189 else
190 AddCode('.');
191 break;
192 case '+':
193 if(Char('+')) AddCode(t_inc);
194 else
195 AssOp('+', t_addass);
196 return;
197 case '-':
198 if(Char('-')) AddCode(t_dec);
199 else
200 if(Char('>'))
201 AddCode(Char('*') ? t_arrow_asteriks : t_arrow);
202 else
203 AssOp('-', t_subass);
204 break;
205 case '&':
206 if(Char('&'))
207 AddCode(t_and);
208 else
209 AssOp('&', t_andass);
210 break;
211 case '|':
212 if(Char('|'))
213 AddCode(t_or);
214 else
215 AssOp('|', t_orass);
216 break;
217 case '=':
218 AssOp('=', t_eq);
219 break;
220 case '<':
221 if(Char('<'))
222 AssOp(t_shl, t_shlass);
223 else
224 AssOp('<', t_le);
225 break;
226 case '>':
227 if(Char('>'))
228 AssOp(t_shr, t_shrass);
229 else
230 AssOp('>', t_ge);
231 break;
232 case '0': {
233 dword w = 0;
234 if(Char('x') || Char('X')) {
235 for(;;) {
236 int d;
237 if(*ptr >= '0' && *ptr <= '9')
238 d = *ptr - '0';
239 else
240 if(*ptr >= 'A' && *ptr <= 'F')
241 d = *ptr - 'A' + 10;
242 else
243 if(*ptr >= 'a' && *ptr <= 'f')
244 d = *ptr - 'a' + 10;
245 else
246 break;
247 if(w >= 0x8000000u - d) {
248 AddCode(te_integeroverflow);
249 return;
250 }
251 w = w * 16 + d - '0';
252 ptr++;
253 }
254 }
255 else
256 while(*ptr >= '0' && *ptr <= '7') {
257 int d = *ptr++ - '0';
258 if(w >= 0x1000000u - d) {
259 AddCode(te_integeroverflow);
260 return;
261 }
262 w = w * 8 + d - '0';
263 }
264 Term& tm = term.AddTail();
265 tm.code = t_integer;
266 tm.ptr = pos;
267 tm.number = w;
268 }
269 break;
270 case_nonzero_digit: {
271 double w = c - '0';
272 bool fp = false;
273 while(*ptr >= '0' && *ptr <= '9')
274 w = w * 10 + *ptr++ - '0';
275 if(*ptr == '.') { //TODO TO BE Completed !!!
276 fp = true;
277 ptr++;
278 double x = 0.1;
279 while(*ptr >= '0' && *ptr <= '9') {
280 w += x * (*ptr++ - '0');
281 x /= 10;
282 }
283 }
284 Term& tm = term.AddTail();
285 if(fp || w < INT_MIN || w > INT_MAX)
286 tm.code = t_double;
287 else
288 tm.code = t_integer;
289 tm.ptr = pos;
290 tm.number = w;
291 }
292 break;
293 case '\'': {
294 Term& tm = term.AddTail();
295 tm.code = t_character;
296 tm.ptr = pos;
297 tm.text = String(GetCharacter(), 1);
298 if(*ptr == '\'')
299 ptr++;
300 else
301 tm.code = te_badcharacter;
302 }
303 break;
304 case '\"': {
305 Term& tm = term.AddTail();
306 tm.code = t_string;
307 tm.ptr = pos;
308 for(;;) {
309 while(*ptr != '\"') {
310 if((byte)*ptr < ' ' && *ptr != 9) {
311 tm.code = te_badstring;
312 return;
313 }
314 tm.text.Cat(GetCharacter());
315 }
316 ptr++;
317 while(*ptr && (byte)*ptr <= ' ') ptr++;
318 if(*ptr != '\"') break;
319 ptr++;
320 }
321 }
322 break;
323 default:
324 AddCode(c);
325 return;
326 }
327 }
328
Prepare(int pos)329 bool Lex::Prepare(int pos) {
330 while(term.GetCount() <= pos) {
331 if(*ptr == '\0') return false;
332 Next();
333 }
334 #ifdef _DEBUG
335 pp = term[0].ptr;
336 #endif
337 return true;
338 }
339
Code(int pos)340 int Lex::Code(int pos)
341 {
342 if(!Prepare(pos)) return t_eof;
343 return term[pos].code;
344 }
345
IsId(int pos)346 bool Lex::IsId(int pos)
347 {
348 return Code(pos) >= endkey + 256;
349 }
350
ThrowError(const char * e)351 void Lex::ThrowError(const char *e)
352 {
353 WhenError(e);
354 throw Parser::Error();
355 }
356
Id(int pos)357 String Lex::Id(int pos)
358 {
359 if(!IsId(pos))
360 ThrowError("expected id");
361 return id[Code(pos) - 256];
362 }
363
Get(int n)364 void Lex::Get(int n)
365 {
366 while(n--) {
367 if(term.GetCount()) {
368 if(body && term.Head().grounding)
369 throw Grounding();
370 int chr = term.Head().code;
371 if(statsCollected)
372 symbolStat.IncStat(chr);
373 if(chr == '{')
374 braceslevel++;
375 else
376 if(chr == '}')
377 braceslevel--;
378 term.DropHead();
379 }
380 if(term.GetCount() == 0)
381 Next();
382 if(term.GetCount() == 0)
383 break;
384 }
385 #ifdef LOGNEXT
386 Dump(0);
387 #endif
388 }
389
Dump(int pos)390 void Lex::Dump(int pos)
391 {
392 #ifdef LOGNEXT
393 int code = Code(pos);
394 switch(code) {
395 case t_string: LOG(AsCString(Text(pos))); break;
396 case t_double: LOG(Double(pos)); break;
397 case t_integer: LOG(Int(pos)); break;
398 case t_character: LOG("char " << AsCString(String(Chr(pos), 1))); break;
399 default:
400 if(code < 0)
401 LOG(decode(Code(),
402 t_dblcolon, "::",
403 t_mulass, "*=",
404 t_divass, "/=",
405 t_modass, "%=",
406 t_xorass, "^=",
407 t_neq, "!=",
408 t_dot_asteriks, ".*",
409 t_elipsis, "...",
410 t_inc, "++",
411 t_addass, "+=",
412 t_dec, "--",
413 t_arrow_asteriks, "->*",
414 t_arrow, "->",
415 t_subass, "-=",
416 t_and, "&&",
417 t_andass, "&=",
418 t_or, "||",
419 t_orass, "|=",
420 t_eq, "==",
421 t_shl, "<<",
422 t_shlass, "<<=",
423 t_le, "<=",
424 t_shr, ">>",
425 t_shrass, ">>=",
426 t_ge, ">=",
427 te_integeroverflow, "<integer overflow>",
428 te_badcharacter, "<bad char>",
429 te_badstring, "<bad string>",
430 "???"));
431 else
432 if(code < 256)
433 LOG((char)code);
434 else
435 LOG(id[code - 256]);
436 }
437 #endif
438 }
439
SkipToGrounding()440 void Lex::SkipToGrounding()
441 {
442 for(;;) {
443 if(term.GetCount() == 0)
444 Next();
445 if(term.GetCount() == 0)
446 break;
447 int chr = term.Head().code;
448 if(chr == t_eof)
449 return;
450 if(term.Head().grounding)
451 return;
452 if(chr == '{')
453 braceslevel++;
454 else
455 if(chr == '}')
456 braceslevel--;
457 term.DropHead();
458 }
459 }
460
Int(int pos)461 int Lex::Int(int pos)
462 {
463 Prepare(pos);
464 if(term[pos].code != t_integer)
465 ThrowError("expected integer literal");
466 return (int)term[pos].number;
467 }
468
Double(int pos)469 double Lex::Double(int pos)
470 {
471 Prepare(pos);
472 if(term[pos].code != t_double)
473 ThrowError("expected floating point literal");
474 return term[pos].number;
475 }
476
Text(int pos)477 String Lex::Text(int pos)
478 {
479 Prepare(pos);
480 if(term[pos].code != t_string)
481 ThrowError("expected string literal");
482 return term[pos].text;
483 }
484
Chr(int pos)485 int Lex::Chr(int pos)
486 {
487 Prepare(pos);
488 if(term[pos].code != t_character)
489 ThrowError("expected character literal");
490 return (byte)*term[pos].text;
491 }
492
Pos(int pos)493 const char *Lex::Pos(int pos)
494 {
495 Prepare(pos);
496 return pos < term.GetCount() ? term[pos].ptr : ptr;
497 }
498
499 }
500