1 /*
2 Copyright (C) 2003 - 2018 by David White <dave@whitevine.net>
3 Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY.
11
12 See the COPYING file for more details.
13 */
14
15 #include "formula/tokenizer.hpp"
16
17 #include <locale>
18 #include <sstream>
19
20 namespace wfl
21 {
22 namespace tokenizer
23 {
24
25 namespace {
26
raise_exception(iterator & i1,iterator i2,std::string str)27 void raise_exception(iterator& i1, iterator i2, std::string str) {
28 std::ostringstream expr;
29 while( (i1 != i2) && (*i1 != '\n') ) {
30 if( (*i1 != '\t') )
31 expr << *i1;
32 ++i1;
33 }
34
35 if( str.empty() )
36 throw token_error("Unrecognized token", expr.str() );
37 else
38 throw token_error(str, expr.str() );
39 }
40
41 }
42
get_token(iterator & i1,const iterator i2)43 token get_token(iterator& i1, const iterator i2) {
44
45 iterator it = i1;
46 if( *i1 >= 'A' ) {
47 //current character is >= 'A', limit search to the upper-half of the ASCII table
48
49 // check if we parse now TOKEN_IDENTIFIER or TOKEN_OPERATOR/KEYWORD based on string
50 if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
51
52 while(i1 != i2 && (std::isalpha(*i1, std::locale::classic()) || *i1 == '_'))
53 ++i1;
54
55 int diff = i1 - it;
56 TOKEN_TYPE t = TOKEN_IDENTIFIER;
57
58 //check if this string matches any keyword or an operator
59 //possible operators and keywords:
60 // d, or, in, def, and, not, wfl, where, wflend, functions
61 if( diff == 1 ) {
62 if( *it == 'd' )
63 t = TOKEN_OPERATOR;
64 } else if( diff == 2 ) {
65 if( *it == 'o' && *(it+1) == 'r' )
66 t = TOKEN_OPERATOR;
67 else if( *it == 'i' && *(it+1) == 'n' )
68 t = TOKEN_OPERATOR;
69 } else if( diff == 3 ) {
70 if( *it == 'd' ) { //def
71 if( *(it+1) == 'e' && *(it+2) == 'f' )
72 t = TOKEN_KEYWORD;
73 } else if( *it == 'a' ) { //and
74 if( *(it+1) == 'n' && *(it+2) == 'd' )
75 t = TOKEN_OPERATOR;
76 } else if( *it == 'n' ) { //not
77 if( *(it+1) == 'o' && *(it+2) == 't' )
78 t = TOKEN_OPERATOR;
79 } else if( *it == 'f' ) { //fai
80 if( *(it+1) == 'a' && *(it+2) == 'i' )
81 t = TOKEN_KEYWORD;
82 } else if( *it == 'w' ) { //wfl
83 if( *(it+1) == 'f' && *(it+2) == 'l' )
84 t = TOKEN_KEYWORD;
85 }
86 } else if( diff == 5 ) {
87 std::string s(it, i1);
88 if( s == "where" )
89 t = TOKEN_OPERATOR;
90 } else if( diff == 6 ) {
91 std::string s(it, i1);
92 if( s == "faiend" )
93 t = TOKEN_KEYWORD;
94 else if( s == "wflend" )
95 t = TOKEN_KEYWORD;
96 } else if( diff == 9 ) {
97 std::string s(it, i1);
98 if( s == "functions" )
99 t = TOKEN_KEYWORD;
100 }
101
102 return token( it, i1, t);
103 } else {
104 //at this point only 3 chars left to check:
105 if( *i1 == '[' )
106 return token( it, ++i1, TOKEN_LSQUARE );
107
108 if( *i1 == ']' )
109 return token( it, ++i1, TOKEN_RSQUARE );
110
111 if( *i1 == '^' )
112 return token( it, ++i1, TOKEN_OPERATOR );
113
114 if( *i1 == '~' )
115 return token( it, ++i1, TOKEN_OPERATOR );
116
117 //unused characters in this range:
118 // \ ` { | }
119 // Note: {} should never be used since they play poorly with WML preprocessor
120 }
121 } else {
122 //limit search to the lower-half of the ASCII table
123 //start by checking for whitespaces/end of line char
124 if( *i1 <= ' ' ) {
125 if( *i1 == '\n' ) {
126 return token( it, ++i1, TOKEN_EOL);
127 } else {
128
129 while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
130 ++i1;
131
132 return token( it, i1, TOKEN_WHITESPACE );
133 }
134 //try to further limit number of characters that we need to check:
135 } else if ( *i1 >= '0' ){
136 //current character is between '0' and '@'
137 if( *i1 <= '9' ) {
138 //we parse integer or decimal number
139 ++i1;
140 bool dot = false;
141
142 while( i1 != i2 ) {
143 if( *i1 >= '0' && *i1 <= '9' ) {
144 //do nothing
145 } else {
146 //look for '.' in case of decimal number
147 if( *i1 == '.' ) {
148 //allow only one dot in such expression
149 if( !dot )
150 dot = true;
151 else
152 raise_exception(it, i2, "Multiple dots near decimal expression");
153 } else
154 break;
155 }
156 ++i1;
157 }
158
159 if( dot )
160 return token( it, i1, TOKEN_DECIMAL );
161 else
162 return token( it, i1, TOKEN_INTEGER );
163
164 } else {
165 //current character is between ':' and '@'
166 //possible tokens at this point that we are interested in:
167 // ; < = > <= >=
168 //unused characters in this range:
169 // : ? @
170
171 if( *i1 == ';' ) {
172 return token( it, ++i1, TOKEN_SEMICOLON);
173 } else if( *i1 == '=' ) {
174 return token( it, ++i1, TOKEN_OPERATOR);
175 } else if( *i1 == '<' ) {
176 ++i1;
177 if( i1 != i2 ) {
178 if( *i1 == '=' )
179 return token( it, ++i1, TOKEN_OPERATOR);
180 else
181 return token( it, i1, TOKEN_OPERATOR);
182 } else
183 return token( it, i1, TOKEN_OPERATOR);
184 } else if( *i1 == '>' ) {
185 ++i1;
186 if( i1 != i2 ) {
187 if( *i1 == '=' )
188 return token( it, ++i1, TOKEN_OPERATOR);
189 else
190 return token( it, i1, TOKEN_OPERATOR);
191 } else
192 return token( it, i1, TOKEN_OPERATOR);
193 }
194 }
195 //current character is between '!' and '/'
196 //possible tokens:
197 // , . .+ .- .* ./ .. ( ) ' # + - -> * / % !=
198 //unused characters:
199 // ! " $ &
200 // ! is used only as part of !=
201 // Note: " should never be used since it plays poorly with WML
202 } else if ( *i1 == ',' ) {
203 return token( it, ++i1, TOKEN_COMMA);
204
205 } else if ( *i1 == '.' ) {
206 ++i1;
207
208 if( i1 != i2 ) {
209 if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/' || *i1 == '.')
210 return token( it, ++i1, TOKEN_OPERATOR );
211 else
212 return token( it, i1, TOKEN_OPERATOR );
213 } else {
214 return token( it, i1, TOKEN_OPERATOR);
215 }
216
217 } else if ( *i1 == '(' ) {
218 return token( it, ++i1, TOKEN_LPARENS);
219
220 } else if ( *i1 == ')' ) {
221 return token( it, ++i1, TOKEN_RPARENS);
222
223 } else if ( *i1 == '\'' ) {
224 int bracket_depth = 0;
225 ++i1;
226 while (i1 != i2) {
227 if (*i1 == '[') {
228 bracket_depth++;
229 } else if(bracket_depth > 0 && *i1 == ']') {
230 bracket_depth--;
231 } else if(bracket_depth == 0 && *i1 == '\'') {
232 break;
233 }
234 ++i1;
235 }
236
237 if( i1 != i2 ) {
238 return token( it, ++i1, TOKEN_STRING_LITERAL );
239 } else {
240 raise_exception(it, i2, "Missing closing ' for formula string");
241 }
242
243 } else if ( *i1 == '#' ) {
244 ++i1;
245 while( i1 != i2 && *i1 != '#' )
246 ++i1;
247
248 if( i1 != i2 ) {
249 return token( it, ++i1, TOKEN_COMMENT );
250 } else {
251 raise_exception(it, i2, "Missing closing # for formula comment");
252 }
253
254 } else if ( *i1 == '+' ) {
255 return token( it, ++i1, TOKEN_OPERATOR);
256
257 } else if ( *i1 == '-' ) {
258 ++i1;
259
260 if( i1 != i2 ) {
261 if( *i1 == '>' )
262 return token( it, ++i1, TOKEN_POINTER );
263 else
264 return token( it, i1, TOKEN_OPERATOR );
265 } else {
266 return token( it, i1, TOKEN_OPERATOR);
267 }
268
269 } else if ( *i1 == '*' ) {
270 return token( it, ++i1, TOKEN_OPERATOR);
271
272 } else if ( *i1 == '/' ) {
273 return token( it, ++i1, TOKEN_OPERATOR);
274
275 } else if ( *i1 == '%' ) {
276 return token( it, ++i1, TOKEN_OPERATOR);
277
278 } else if ( *i1 == '!' ) {
279 ++i1;
280 if( *i1 == '=' )
281 return token( it, ++i1, TOKEN_OPERATOR);
282 else
283 raise_exception(it, i2, std::string() );
284 }
285 }
286 raise_exception(it, i2, std::string() );
287 return token();
288 }
289
290 }
291
292 }
293