1 /*
2    Copyright (C) 2003 - 2018 by David White <dave@whitevine.net>
3    Part of the Battle for Wesnoth Project https://www.wesnoth.org/
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 2 of the License, or
8    (at your option) any later version.
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY.
11 
12    See the COPYING file for more details.
13 */
14 
15 #include "formula/tokenizer.hpp"
16 
17 #include <locale>
18 #include <sstream>
19 
20 namespace wfl
21 {
22 namespace tokenizer
23 {
24 
25 namespace {
26 
raise_exception(iterator & i1,iterator i2,std::string str)27 void raise_exception(iterator& i1, iterator i2, std::string str) {
28 	std::ostringstream expr;
29 	while( (i1 != i2) && (*i1 != '\n') ) {
30 		if( (*i1 != '\t') )
31 			expr << *i1;
32 		++i1;
33 	}
34 
35 	if( str.empty() )
36 		throw token_error("Unrecognized token", expr.str() );
37 	else
38 		throw token_error(str, expr.str() );
39 }
40 
41 }
42 
get_token(iterator & i1,const iterator i2)43 token get_token(iterator& i1, const iterator i2) {
44 
45 	iterator it = i1;
46 	if( *i1 >= 'A' ) {
47 		//current character is >= 'A', limit search to the upper-half of the ASCII table
48 
49 		// check if we parse now TOKEN_IDENTIFIER or TOKEN_OPERATOR/KEYWORD based on string
50 		if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
51 
52 			while(i1 != i2 && (std::isalpha(*i1, std::locale::classic()) || *i1 == '_'))
53 				++i1;
54 
55 			int diff = i1 - it;
56 			TOKEN_TYPE t = TOKEN_IDENTIFIER;
57 
58 			//check if this string matches any keyword or an operator
59 			//possible operators and keywords:
60 			// d, or, in, def, and, not, wfl, where, wflend, functions
61 			if( diff == 1 ) {
62 				if( *it == 'd' )
63 					t = TOKEN_OPERATOR;
64 			} else if( diff == 2 ) {
65 				if( *it == 'o' && *(it+1) == 'r' )
66 					t = TOKEN_OPERATOR;
67 				else if( *it == 'i' && *(it+1) == 'n' )
68 					t = TOKEN_OPERATOR;
69 			} else if( diff == 3 ) {
70 				if( *it == 'd' ) { //def
71 					if( *(it+1) == 'e' && *(it+2) == 'f' )
72 						t = TOKEN_KEYWORD;
73 				} else if( *it == 'a' ) { //and
74 					if( *(it+1) == 'n' && *(it+2) == 'd' )
75 						t = TOKEN_OPERATOR;
76 				} else if( *it == 'n' ) { //not
77 					if( *(it+1) == 'o' && *(it+2) == 't' )
78 						t = TOKEN_OPERATOR;
79 				} else if( *it == 'f' ) { //fai
80 					if( *(it+1) == 'a' && *(it+2) == 'i' )
81 						t = TOKEN_KEYWORD;
82 				} else if( *it == 'w' ) { //wfl
83 					if( *(it+1) == 'f' && *(it+2) == 'l' )
84 						t = TOKEN_KEYWORD;
85 				}
86 			} else if( diff == 5 ) {
87 				std::string s(it, i1);
88 				if( s == "where" )
89 					t = TOKEN_OPERATOR;
90 			} else if( diff == 6 ) {
91 				std::string s(it, i1);
92 				if( s == "faiend" )
93 					t = TOKEN_KEYWORD;
94 				else if( s == "wflend" )
95 					t = TOKEN_KEYWORD;
96 			} else if( diff == 9 ) {
97 				std::string s(it, i1);
98 				if( s == "functions" )
99 					t = TOKEN_KEYWORD;
100 			}
101 
102 			return token( it, i1, t);
103 		} else {
104 			//at this point only 3 chars left to check:
105 			if( *i1 == '[' )
106 				return token( it, ++i1, TOKEN_LSQUARE );
107 
108 			if( *i1 == ']' )
109 				return token( it, ++i1, TOKEN_RSQUARE );
110 
111 			if( *i1 == '^' )
112 				return token( it, ++i1, TOKEN_OPERATOR );
113 
114 			if( *i1 == '~' )
115 				return token( it, ++i1, TOKEN_OPERATOR );
116 
117 			//unused characters in this range:
118 			// \ ` { | }
119 			// Note: {} should never be used since they play poorly with WML preprocessor
120 		}
121 	} else {
122 		//limit search to the lower-half of the ASCII table
123 		//start by checking for whitespaces/end of line char
124 		if( *i1 <= ' ' ) {
125 			if( *i1 == '\n' ) {
126 				return token( it, ++i1, TOKEN_EOL);
127 			} else {
128 
129 				while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
130 					++i1;
131 
132 				return token( it, i1, TOKEN_WHITESPACE );
133 			}
134 		//try to further limit number of characters that we need to check:
135 		} else if ( *i1 >= '0' ){
136 			//current character is between '0' and '@'
137 			if( *i1 <= '9' ) {
138 				//we parse integer or decimal number
139 				++i1;
140 				bool dot = false;
141 
142 				while( i1 != i2 ) {
143 					if( *i1 >= '0' && *i1 <= '9' ) {
144 						//do nothing
145 					} else {
146 						//look for '.' in case of decimal number
147 						if( *i1 == '.' ) {
148 							//allow only one dot in such expression
149 							if( !dot )
150 								dot = true;
151 							else
152 								raise_exception(it, i2, "Multiple dots near decimal expression");
153 						} else
154 							break;
155 					}
156 					++i1;
157 				}
158 
159 				if( dot )
160 					return token( it, i1, TOKEN_DECIMAL );
161 				else
162 					return token( it, i1, TOKEN_INTEGER );
163 
164 			} else {
165 				//current character is between ':' and '@'
166 				//possible tokens at this point that we are interested in:
167 				// ; < = > <= >=
168 				//unused characters in this range:
169 				// : ? @
170 
171 				if( *i1 == ';' ) {
172 					return token( it, ++i1, TOKEN_SEMICOLON);
173 				} else if( *i1 == '=' ) {
174 					return token( it, ++i1, TOKEN_OPERATOR);
175 				} else if( *i1 == '<' ) {
176 					++i1;
177 					if( i1 != i2 ) {
178 						if( *i1 == '=' )
179 							return token( it, ++i1, TOKEN_OPERATOR);
180 						else
181 							return token( it, i1, TOKEN_OPERATOR);
182 					} else
183 						return token( it, i1, TOKEN_OPERATOR);
184 				} else if( *i1 == '>' ) {
185 					++i1;
186 					if( i1 != i2 ) {
187 						if( *i1 == '=' )
188 							return token( it, ++i1, TOKEN_OPERATOR);
189 						else
190 							return token( it, i1, TOKEN_OPERATOR);
191 					} else
192 						return token( it, i1, TOKEN_OPERATOR);
193 				}
194 			}
195 		//current character is between '!' and '/'
196 		//possible tokens:
197 		// , . .+ .- .* ./ .. ( ) ' # + - -> * / % !=
198 		//unused characters:
199 		// ! " $ &
200 		// ! is used only as part of !=
201 		// Note: " should never be used since it plays poorly with WML
202 		} else if ( *i1 == ',' ) {
203 			return token( it, ++i1, TOKEN_COMMA);
204 
205 		} else if ( *i1 == '.' ) {
206 			++i1;
207 
208 			if( i1 != i2 ) {
209 				if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/' || *i1 == '.')
210 					return token( it, ++i1, TOKEN_OPERATOR );
211 				else
212 					return token( it, i1, TOKEN_OPERATOR );
213 			} else {
214 				return token( it, i1, TOKEN_OPERATOR);
215 			}
216 
217 		} else if ( *i1 == '(' ) {
218 			return token( it, ++i1, TOKEN_LPARENS);
219 
220 		} else if ( *i1 == ')' ) {
221 			return token( it, ++i1, TOKEN_RPARENS);
222 
223 		} else if ( *i1 == '\'' ) {
224 			int bracket_depth = 0;
225 			++i1;
226 			while (i1 != i2) {
227 				if (*i1 == '[') {
228 					bracket_depth++;
229 				} else if(bracket_depth > 0 && *i1 == ']') {
230 					bracket_depth--;
231 				} else if(bracket_depth == 0 && *i1 == '\'') {
232 					break;
233 				}
234 				++i1;
235 			}
236 
237 			if( i1 != i2 ) {
238 				return token( it, ++i1, TOKEN_STRING_LITERAL );
239 			} else {
240 				raise_exception(it, i2, "Missing closing ' for formula string");
241 			}
242 
243 		} else if ( *i1 == '#' ) {
244 			++i1;
245 			while( i1 != i2 && *i1 != '#' )
246 				++i1;
247 
248 			if( i1 != i2 ) {
249 				return token( it, ++i1, TOKEN_COMMENT );
250 			} else {
251 				raise_exception(it, i2, "Missing closing # for formula comment");
252 			}
253 
254 		} else if ( *i1 == '+' ) {
255 			return token( it, ++i1, TOKEN_OPERATOR);
256 
257 		} else if ( *i1 == '-' ) {
258 			++i1;
259 
260 			if( i1 != i2 ) {
261 				if( *i1 == '>' )
262 					return token( it, ++i1, TOKEN_POINTER );
263 				else
264 					return token( it, i1, TOKEN_OPERATOR );
265 			} else {
266 				return token( it, i1, TOKEN_OPERATOR);
267 			}
268 
269 		} else if ( *i1 == '*' ) {
270 			return token( it, ++i1, TOKEN_OPERATOR);
271 
272 		} else if ( *i1 == '/' ) {
273 			return token( it, ++i1, TOKEN_OPERATOR);
274 
275 		} else if ( *i1 == '%' ) {
276 			return token( it, ++i1, TOKEN_OPERATOR);
277 
278 		} else if ( *i1 == '!' ) {
279 			++i1;
280 			if( *i1 == '=' )
281 				return token( it, ++i1, TOKEN_OPERATOR);
282 			else
283 				raise_exception(it, i2, std::string() );
284 		}
285 	}
286 	raise_exception(it, i2, std::string() );
287 	return token();
288 }
289 
290 }
291 
292 }
293