1 /*
2  * This file is part of the Yices SMT Solver.
3  * Copyright (C) 2017 SRI International.
4  *
5  * Yices is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Yices is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with Yices.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 /*
20  * Lexer for Yices language
21  * - separators are ( ) : spaces, EOF, ; and "
22  *
23  * - strings are delimited by " with escaped char \n, \t, etc. allowed
24  *
25  * - two kinds of numeric literals are recognized
26  *     TK_NUM_RATIONAL:  <optional_sign><digits>/<digits>
27  *                    or <optional_sign><digits>
28  *     TK_NUM_FLOAT:
29  *       <optional_sign> <digits> . <digits>
30  *       <optional_sign> <digits> <exp> <optional_sign> <digits>
31  *       <optional_sign> <digits> . <digits> <exp> <optional_sign> <digits>
32  *
33  *   (the two formats recognized by string-to-rational conversions in rational.c)
34  *
35  * - bit-vector literals are written 0b<binary digits> (cf. bv_constants.c)
36  *   (added 5/10/07) can also be written 0x<hexa digits>
37  *
38  * - comments start with ; and extend to the end of the line
39  */
40 
41 #include <ctype.h>
42 #include <assert.h>
43 
44 /*
45  * yices_hash_keywords.h is generated by gperf
46  * from input file yices_keywords.txt
47  */
48 #include "frontend/yices/yices_hash_keywords.h"
49 #include "frontend/yices/yices_lexer.h"
50 
51 
52 /*
53  * All keywords
54  */
55 static keyword_t yices_keywords[] = {
56   // type keywords
57   { "bool", TK_BOOL },
58   { "int", TK_INT },
59   { "real", TK_REAL },
60   { "bitvector", TK_BITVECTOR },
61   { "scalar", TK_SCALAR },
62   { "tuple", TK_TUPLE },
63   { "->", TK_ARROW },
64 
65   // term keywords
66   { "true", TK_TRUE },
67   { "false", TK_FALSE },
68   { "if", TK_IF },
69   { "ite", TK_ITE },
70   { "=", TK_EQ },
71   { "/=", TK_DISEQ },
72   { "distinct", TK_DISTINCT },
73 
74   { "or", TK_OR },
75   { "and", TK_AND },
76   { "not", TK_NOT },
77   { "xor", TK_XOR },
78   { "<=>", TK_IFF },
79   { "=>", TK_IMPLIES },
80   { "mk-tuple", TK_MK_TUPLE },
81   { "select", TK_SELECT },
82   { "tuple-update", TK_UPDATE_TUPLE },
83   { "update", TK_UPDATE },
84   { "forall", TK_FORALL },
85   { "exists", TK_EXISTS },
86   { "lambda", TK_LAMBDA },
87 
88   // arithmetic keywords
89   { "+", TK_ADD },
90   { "-", TK_SUB },
91   { "*", TK_MUL },
92   { "/", TK_DIV },
93   { "^", TK_POW },
94   { "<", TK_LT },
95   { "<=", TK_LE },
96   { ">", TK_GT },
97   { ">=", TK_GE },
98 
99   // bitvector keywords
100   { "mk-bv", TK_MK_BV },
101   { "bv-add", TK_BV_ADD },
102   { "bv-sub", TK_BV_SUB },
103   { "bv-mul", TK_BV_MUL },
104   { "bv-neg", TK_BV_NEG },
105   { "bv-pow", TK_BV_POW },
106   { "bv-not", TK_BV_NOT },
107   { "bv-and", TK_BV_AND },
108   { "bv-or", TK_BV_OR },
109   { "bv-xor", TK_BV_XOR },
110   { "bv-nand", TK_BV_NAND },
111   { "bv-nor", TK_BV_NOR },
112   { "bv-xnor", TK_BV_XNOR },
113   { "bv-shift-left0", TK_BV_SHIFT_LEFT0 },
114   { "bv-shift-left1", TK_BV_SHIFT_LEFT1 },
115   { "bv-shift-right0", TK_BV_SHIFT_RIGHT0 },
116   { "bv-shift-right1", TK_BV_SHIFT_RIGHT1 },
117   { "bv-ashift-right", TK_BV_ASHIFT_RIGHT },
118   { "bv-rotate-left", TK_BV_ROTATE_LEFT },
119   { "bv-rotate-right", TK_BV_ROTATE_RIGHT },
120   { "bv-extract", TK_BV_EXTRACT },
121   { "bv-concat", TK_BV_CONCAT },
122   { "bv-repeat", TK_BV_REPEAT },
123   { "bv-sign-extend", TK_BV_SIGN_EXTEND },
124   { "bv-zero-extend", TK_BV_ZERO_EXTEND },
125   { "bv-ge", TK_BV_GE },
126   { "bv-gt", TK_BV_GT },
127   { "bv-le", TK_BV_LE },
128   { "bv-lt", TK_BV_LT },
129   { "bv-sge", TK_BV_SGE },
130   { "bv-sgt", TK_BV_SGT },
131   { "bv-sle", TK_BV_SLE },
132   { "bv-slt", TK_BV_SLT },
133 
134   // more bitvector keywords
135   { "bv-shl", TK_BV_SHL },
136   { "bv-lshr", TK_BV_LSHR },
137   { "bv-ashr", TK_BV_ASHR },
138   { "bv-div", TK_BV_DIV },
139   { "bv-rem", TK_BV_REM },
140   { "bv-sdiv", TK_BV_SDIV },
141   { "bv-srem", TK_BV_SREM },
142   { "bv-smod", TK_BV_SMOD },
143   { "bv-redor", TK_BV_REDOR },
144   { "bv-redand", TK_BV_REDAND },
145   { "bv-comp", TK_BV_COMP },
146 
147   // conversions between bitvectors and Booleans
148   { "bool-to-bv", TK_BOOL_TO_BV },
149   { "bit", TK_BIT},
150 
151   // more arithmetic functions
152   { "floor", TK_FLOOR },
153   { "ceil", TK_CEIL },
154   { "abs", TK_ABS },
155   { "div", TK_IDIV },
156   { "mod", TK_MOD },
157   { "divides", TK_DIVIDES },
158   { "is-int", TK_IS_INT },
159 
160   // other keywords
161   { "let", TK_LET },
162   { "define-type", TK_DEFINE_TYPE },
163   { "define", TK_DEFINE },
164   { "assert", TK_ASSERT },
165   { "check", TK_CHECK },
166   { "check-assuming", TK_CHECK_ASSUMING },
167   { "push", TK_PUSH },
168   { "pop", TK_POP },
169   { "reset", TK_RESET },
170   { "dump-context", TK_DUMP_CONTEXT },
171   { "exit", TK_EXIT },
172   { "echo", TK_ECHO },
173   { "include", TK_INCLUDE },
174   { "show-model", TK_SHOW_MODEL },
175   { "eval", TK_EVAL },
176   { "set-param", TK_SET_PARAM },
177   { "show-param", TK_SHOW_PARAM },
178   { "show-params", TK_SHOW_PARAMS },
179   { "show-stats", TK_SHOW_STATS },
180   { "reset-stats", TK_RESET_STATS },
181   { "set-timeout", TK_SET_TIMEOUT },
182   { "show-timeout", TK_SHOW_TIMEOUT },
183   { "help", TK_HELP },
184   { "ef-solve", TK_EF_SOLVE },
185   { "export-to-dimacs", TK_EXPORT_TO_DIMACS },
186   { "show-implicant", TK_SHOW_IMPLICANT },
187   { "show-unsat-core", TK_SHOW_UNSAT_CORE },
188   { "show-unsat-assumptions", TK_SHOW_UNSAT_ASSUMPTIONS },
189   { "show-reduced-model", TK_SHOW_REDUCED_MODEL },
190 
191   // end-marker
192   { NULL, 0 },
193 };
194 
195 /*
196  * name for each token
197  */
198 static char *token_string[NUM_YICES_TOKENS];
199 
200 
201 /*
202  * Initialize token2string table
203  */
init_token2string(void)204 static void init_token2string(void) {
205   keyword_t *kw;
206 
207   // keywords
208   kw = yices_keywords;
209   while (kw->word != NULL) {
210     token_string[kw->tk] = kw->word;
211     kw ++;
212   }
213 
214   // other tokens
215   token_string[TK_LP] = "(";
216   token_string[TK_RP] = ")";
217   token_string[TK_COLON_COLON] = "::";
218   token_string[TK_EOS] = "<end-of-stream>";
219   token_string[TK_STRING] = "<string>";
220   token_string[TK_NUM_RATIONAL] = "<rational>";
221   token_string[TK_NUM_FLOAT] = "<float>";
222   token_string[TK_BV_CONSTANT] = "<bv-constant>";
223   token_string[TK_HEX_CONSTANT] = "<hex-constant>";
224   token_string[TK_SYMBOL] = "<symbol>";
225 
226   token_string[TK_OPEN_STRING] = "<bad-string>";
227   token_string[TK_EMPTY_BVCONST] = "<bad-bvconst>";
228   token_string[TK_EMPTY_HEXCONST] = "<bad-hexconst>";
229   token_string[TK_INVALID_NUM] = "<bad-float>";
230   token_string[TK_ZERO_DIVISOR] = "<zero-divisor-in-rational>";
231   token_string[TK_ERROR] = "<error>";
232 }
233 
234 
235 /*
236  * Lexer initialization
237  */
init_yices_file_lexer(lexer_t * lex,char * filename)238 int32_t init_yices_file_lexer(lexer_t *lex, char *filename) {
239   init_token2string();
240   return init_file_lexer(lex, filename);
241 }
242 
init_yices_stream_lexer(lexer_t * lex,FILE * f,char * name)243 void init_yices_stream_lexer(lexer_t *lex, FILE *f, char *name) {
244   init_token2string();
245   init_stream_lexer(lex, f, name);
246 }
247 
init_yices_string_lexer(lexer_t * lex,char * data,char * name)248 void init_yices_string_lexer(lexer_t *lex, char *data, char *name) {
249   init_token2string();
250   init_string_lexer(lex, data, name);
251 }
252 
253 
254 
255 /*
256  * Get token_string
257  */
yices_token_to_string(yices_token_t tk)258 char *yices_token_to_string(yices_token_t tk) {
259   assert(0 <= tk && tk < NUM_YICES_TOKENS);
260   return token_string[tk];
261 }
262 
263 /*
264  * Check whether c is a separator character (cannot be part of a symbol)
265  */
is_yices_sep(int c)266 static bool is_yices_sep(int c) {
267   return isspace(c) || c == EOF || c == '(' || c == ')' ||
268     c == ':' || c == ';' || c == '"';
269 }
270 
271 
272 /*
273  * Read a string literal:
274  * - lex->current_char == '"' and lex->buffer is empty
275  */
read_string(lexer_t * lex)276 static yices_token_t read_string(lexer_t *lex) {
277   yices_token_t tk;
278   int c, x;
279   reader_t *rd;
280   string_buffer_t *buffer;
281 
282   rd = &lex->reader;
283   buffer = lex->buffer;
284   assert(reader_current_char(rd) == '"');
285 
286   c = reader_next_char(rd);
287 
288   for (;;) {
289     if (c == '"') { // end of string
290       // consume the closing quote
291       reader_next_char(rd);
292       tk = TK_STRING;
293       break;
294     }
295     if (c == '\n' || c == EOF) { // missing quotes
296       tk = TK_OPEN_STRING;
297       break;
298     }
299     if (c == '\\') {
300       // escape sequence
301       c = reader_next_char(rd);
302       switch (c) {
303       case 'n': c = '\n'; break;
304       case 't': c = '\t'; break;
305       default:
306         if ('0' <= c && c <= '7') {
307           // read at most 2 more octal digits
308           x = c - '0';
309           c = reader_next_char(rd);
310           if ('0' <= c && c <= '7') {
311             x = 8 * x + (c - '0');
312             c = reader_next_char(rd);
313             if ('0' <= c && c <= '7') {
314               x = 8 * x + (c - '0');
315               c = reader_next_char(rd);
316             }
317           }
318           // x = character built from the octal digits
319           // c = character after octal digit
320           string_buffer_append_char(buffer, x);
321           continue;
322         } // else skip '\': copy c in the buffer
323         break;
324       }
325     }
326     string_buffer_append_char(buffer, c);
327     c = reader_next_char(rd);
328   }
329 
330   string_buffer_close(buffer);
331   return tk;
332 }
333 
334 /*
335  * Read a bitvector constant:
336  * lex->current_char = 'b' and lex->buffer contains "0"
337  */
read_bv_constant(lexer_t * lex)338 static yices_token_t read_bv_constant(lexer_t *lex) {
339   reader_t *rd;
340   string_buffer_t *buffer;
341   int c;
342 
343   rd = &lex->reader;
344   c = reader_current_char(rd);
345   buffer = lex->buffer;
346 
347   do {
348     string_buffer_append_char(buffer, c);
349     c = reader_next_char(rd);
350   } while (c == '0' || c == '1');
351   string_buffer_close(buffer);
352 
353   if (string_buffer_length(buffer) <= 2) {
354     return TK_EMPTY_BVCONST; // empty constant
355   } else {
356     return TK_BV_CONSTANT;
357   }
358 }
359 
360 /*
361  * Read a hexadecimal constant:
362  * lex->current_char = 'x' and lex->buffer contains "0"
363  */
read_hex_constant(lexer_t * lex)364 static yices_token_t read_hex_constant(lexer_t *lex) {
365   reader_t *rd;
366   string_buffer_t *buffer;
367   int c;
368 
369   rd = &lex->reader;
370   c = reader_current_char(rd);
371   buffer = lex->buffer;
372 
373   do {
374     string_buffer_append_char(buffer, c);
375     c = reader_next_char(rd);
376   } while (isxdigit(c));
377   string_buffer_close(buffer);
378 
379   if (string_buffer_length(buffer) <= 2) {
380     return TK_EMPTY_HEXCONST; // empty constant
381   } else {
382     return TK_HEX_CONSTANT;
383   }
384 }
385 
386 /*
387  * Read a symbol or keyword
388  * lex->buffer contains one char (not a separator or digit)
389  * char = next character after that.
390  */
read_symbol(lexer_t * lex)391 static yices_token_t read_symbol(lexer_t *lex) {
392   reader_t *rd;
393   string_buffer_t *buffer;
394   int c;
395   token_t tk;
396   const keyword_t *kw;
397 
398   rd = &lex->reader;
399   c = reader_current_char(rd);
400   buffer = lex->buffer;
401 
402   while (! is_yices_sep(c)) {
403     string_buffer_append_char(buffer, c);
404     c = reader_next_char(rd);
405   }
406 
407   string_buffer_close(buffer);
408 
409   tk = TK_SYMBOL;
410   kw = in_yices_kw(buffer->data, buffer->index);
411   if (kw != NULL) {
412     tk = kw->tk;
413   }
414 
415   return tk;
416 }
417 
418 /*
419  * Read a number
420  * lex->buffer contains <optional_sign> and a single digit
421  * current_char = what's after the digit in buffer.
422  */
read_number(lexer_t * lex)423 static yices_token_t read_number(lexer_t *lex) {
424   reader_t *rd;
425   string_buffer_t *buffer;
426   int c, all_zeros;
427   yices_token_t tk;
428 
429   rd = &lex->reader;
430   c = reader_current_char(rd);
431   buffer = lex->buffer;
432   tk = TK_NUM_RATIONAL; // default
433 
434   while (isdigit(c)) {
435     string_buffer_append_char(buffer, c);
436     c = reader_next_char(rd);
437   }
438 
439   if (c == '/') {
440     // denominator
441     string_buffer_append_char(buffer, c);
442     c = reader_next_char(rd);
443     if (! isdigit(c)) {
444       tk = TK_INVALID_NUM;
445       goto done;
446     }
447     all_zeros = true;
448     do {
449       if (c != '0') all_zeros = false;
450       string_buffer_append_char(buffer, c);
451       c = reader_next_char(rd);
452     } while (isdigit(c));
453 
454     if (all_zeros) tk = TK_ZERO_DIVISOR;
455     // else tk = TK_NUM_RATIONAL
456     goto done;
457   }
458 
459   if (c == '.') {
460     tk = TK_NUM_FLOAT;
461     // fractional part
462     string_buffer_append_char(buffer, c);
463     c = reader_next_char(rd);
464     if (! isdigit(c)) {
465       tk = TK_INVALID_NUM;
466       goto done;
467     }
468     do {
469       string_buffer_append_char(buffer, c);
470       c = reader_next_char(rd);
471     } while (isdigit(c));
472   }
473 
474   if (c == 'e' || c == 'E') {
475     tk = TK_NUM_FLOAT;
476     // exponent
477     string_buffer_append_char(buffer, c);
478     c = reader_next_char(rd);
479     if (c == '+' || c == '-') {
480       string_buffer_append_char(buffer, c);
481       c = reader_next_char(rd);
482     }
483     if (! isdigit(c)) {
484       tk = TK_INVALID_NUM;
485       goto done;
486     }
487     do {
488       string_buffer_append_char(buffer, c);
489       c = reader_next_char(rd);
490     } while (isdigit(c));
491   }
492 
493  done:
494   string_buffer_close(buffer);
495   return tk;
496 }
497 
498 /*
499  * Read next token and return its type tk
500  * - set lex->token to tk
501  * - set lex->tk_pos, etc.
502  * - if token is TK_STRING, TK_NUM_RATIONAL, TK_NUM_FLOAT, TK_BV_CONSTANT, TK_SYMBOL, TK_ERROR,
503  *   the token value is stored in lex->buffer (as a string).
504  */
next_yices_token(lexer_t * lex)505 yices_token_t next_yices_token(lexer_t *lex) {
506   yices_token_t tk;
507   reader_t *rd;
508   string_buffer_t *buffer;
509   int c;
510 
511   rd = &lex->reader;
512   c = reader_current_char(rd);
513   buffer = lex->buffer;
514   string_buffer_reset(buffer);
515 
516   // skip spaces and comments
517   for (;;) {
518     while (isspace(c)) c = reader_next_char(rd);
519     if (c != ';') break;
520     do { // read to end-of-line or eof
521       c = reader_next_char(rd);
522     } while (c != '\n' && c != EOF);
523   }
524 
525   // record token position (start of token)
526   lex->tk_pos = rd->pos;
527   lex->tk_line = rd->line;
528   lex->tk_column = rd->column;
529 
530   switch (c) {
531   case '(':
532     tk = TK_LP;
533     goto next_then_return;
534   case ')':
535     tk = TK_RP;
536     goto next_then_return;
537   case EOF:
538     tk = TK_EOS;
539     goto done;
540   case ':':
541     c = reader_next_char(rd);
542     if (c == ':') {
543       tk = TK_COLON_COLON;
544       goto next_then_return;
545     } else {
546       // store ':' in the buffer since that may be used for reporting errors
547       string_buffer_append_char(buffer, ':');
548       string_buffer_close(buffer);
549       tk = TK_ERROR;
550       goto done;
551     }
552   case '"':
553     tk = read_string(lex);
554     goto done;
555   case '+':
556   case '-':
557     string_buffer_append_char(buffer, c);
558     c = reader_next_char(rd);
559     if (isdigit(c)) {
560       string_buffer_append_char(buffer, c);
561       reader_next_char(rd);
562       tk = read_number(lex);
563     } else {
564       tk = read_symbol(lex);
565     }
566     goto done;
567 
568   case '0':
569     string_buffer_append_char(buffer, c);
570     c = reader_next_char(rd);
571     if (c == 'b') {
572       tk = read_bv_constant(lex);
573     } else if (c == 'x') {
574       tk = read_hex_constant(lex);
575     } else {
576       tk = read_number(lex);
577     }
578     goto done;
579 
580   case '1':
581   case '2':
582   case '3':
583   case '4':
584   case '5':
585   case '6':
586   case '7':
587   case '8':
588   case '9':
589     string_buffer_append_char(buffer, c);
590     reader_next_char(rd);
591     tk = read_number(lex);
592     goto done;
593 
594   default: // symbol or keyword
595     string_buffer_append_char(buffer, c);
596     reader_next_char(rd);
597     tk = read_symbol(lex);
598     goto done;
599   }
600 
601   /*
602    * read next character and exit
603    */
604  next_then_return:
605   reader_next_char(rd);
606 
607 
608  done:
609   lex->token = tk;
610   return tk;
611 }
612 
613