1 /*
2    Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License, version 2.0,
6    as published by the Free Software Foundation.
7 
8    This program is also distributed with certain software (including
9    but not limited to OpenSSL) that is licensed under separate terms,
10    as designated in a particular file or component or in included license
11    documentation.  The authors of MySQL hereby grant you an additional
12    permission to link the program and your derivative works with the
13    separately licensed software that they have included with MySQL.
14 
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License, version 2.0, for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
23 
24 #include <limits.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
29 /* We only need the tokens here */
30 #define YYSTYPE_IS_DECLARED
31 
32 #include "sql/lex.h"
33 #include "sql/lex_symbol.h"
34 #include "sql/sql_yacc.h"
35 #include "welcome_copyright_notice.h" /* ORACLE_WELCOME_COPYRIGHT_NOTICE */
36 
37 /*
38   MAINTAINER:
39 
40   Tokens printed in sql/lex_token.h do come from several sources:
41   - tokens from sql_yacc.yy
42   - tokens from sql_hints.yy
43   - fake tokens for digests.
44 
45   All the token values are mapped in the same space,
46   indexed by the token value directly.
47 
48   To account for enhancements and new tokens,
  gaps are created, so that adding a token from one source
50   does not change values of tokens from other sources.
51 
52   This is done to ensure stability in digest computed values.
53 
54   As of now (8.0.0), the mapping looks like this:
55   - PART 1: [0 .. 255] tokens of single-character lexemes
56   - PART 2: [256 .. ...] tokens < YYUNDEF from sql_yacc.yy
57   - PART 3: [... .. 999] reserved for sql_yacc.yy new tokens < YYUNDEF
58   - PART 4: [1000 .. ...] tokens from sql_hints.yy
59   - PART 5: [... .. 1099] reserved for sql_hints.yy new tokens
60   - PART 6: [1100 .. ...] digest special fake tokens
61   - PART 7: [... .. 1149] reserved for new digest special fake tokens
62   - PART 8: [1150 .. ...] tokens > YYUNDEF from sql_yacc.yy
63 
64   Should gen_lex_token fail when tokens are exhausted
65   (maybe you are reading this comment because of a fprintf(stderr) below),
66   the options are as follows, by order of decreasing desirability:
67 
68   1) Reuse OBSOLETE_TOKEN_XXX instead of consuming new token values
69 
70   2) Consider if you really need to create a new token,
  instead of reusing an existing one.
72 
73   Keep in mind that syntax sugar in the parser still adds
74   to complexity, by making the parser tables bigger,
75   so adding tokens all the time is not a good practice.
76 
77   3) Expand boundary values for
78      - range_for_sql_hints
79      - range_for_digests
80   and record again all the MTR tests that print a DIGEST,
81   because DIGEST values have now changed.
82 
83   While at it, because digests have changed anyway,
84   please seriously consider to clean up and reorder:
85   - all the tokens in sql/sql_yacc.yy in one nice list,
86   ordered alphabetically, removing obsolete values if any.
87   - likewise for sql/sql_hints.yy
88 */
89 
/** One generated token: its display text and digest-normalization flags. */
struct gen_lex_token_string {
  /**
    Real lexeme string or user-specified text to output with a normalized
    query string.
  */
  const char *m_token_string{nullptr};

  /**
    Byte length of m_token_string.
  */
  int m_token_length{0};

  /**
    If true, output ' ' after this token to a normalized query string.
    See digest_add_token().
  */
  bool m_append_space{true};

  /**
    See digest_add_token().
  */
  bool m_start_expr{false};

  /**
    The structure is uninitialized if false.
  */
  bool m_initialized{false};

  gen_lex_token_string() = default;

  gen_lex_token_string(const char *token_string, int token_length,
                       bool append_space, bool start_expr)
      : m_token_string{token_string},
        m_token_length{token_length},
        m_append_space{append_space},
        m_start_expr{start_expr} {}
};
132 
133 /*
134   This is a tool used during build only,
135   so MY_MAX_TOKEN does not need to be exact,
136   only big enough to hold:
137   - 256 of single-character lexeme tokens
138   - up to 1000 named tokens from bison (sql_yacc.yy).
139   - padding
140   - tokens from bison (sql_hints.yy).
141   - padding
142   - DIGEST special tokens.
143   - padding
144   - mode named tokens from bison (sql_yacc.yy).
145   See also YYMAXUTOK.
146 */
147 const int MY_MAX_TOKEN = 1200;
148 
149 gen_lex_token_string compiled_token_array[MY_MAX_TOKEN];
150 
151 struct range {
rangerange152   range(const char *title, int start, int end)
153       : title{title}, start{start}, end{end}, max_seen{0} {}
154 
set_tokenrange155   void set_token(int tok, const char *str, int line) {
156     if (tok <= 0) {
157       fprintf(stderr, "%s:%d: Bad token found\n", __FILE__, line);
158       exit(1);
159     }
160 
161     if (tok > end) {
162       fprintf(stderr,
163               "%s:%d: Token reserve for %s exhausted: %d (should be <= %d).\n"
164               "Please see MAINTAINER instructions in sql/gen_lex_token.cc\n",
165               __FILE__, line, title, tok, end);
166       exit(1);
167     }
168 
169     if (tok >= MY_MAX_TOKEN) {
170       fprintf(stderr,
171               "%s:%d: Added that many new keywords ? Increase MY_MAX_TOKEN\n",
172               __FILE__, line);
173       exit(1);
174     }
175 
176     if (tok > max_seen) {
177       max_seen = tok;
178     }
179 
180     compiled_token_array[tok].m_initialized = true;
181     compiled_token_array[tok].m_token_string = str;
182     compiled_token_array[tok].m_token_length = strlen(str);
183     compiled_token_array[tok].m_append_space = true;
184     compiled_token_array[tok].m_start_expr = false;
185   }
186 
add_tokenrange187   int add_token(const char *str, int line) {
188     set_token(max_seen ? max_seen + 1 : start, str, line);
189     return max_seen;
190   }
191 
printrange192   void print(const char *header1, const char *header2 = nullptr) const {
193     puts(header1);
194     for (int tok = start; tok <= max_seen; tok++) {
195       print_token(tok);
196     }
197 
198     if (header2 == nullptr) {
199       return;
200     }
201 
202     puts(header2);
203     for (int tok = max_seen + 1; tok <= end; tok++) {
204       printf("/* reserved %03d for %s */  { \"\", 0, false, false},\n", tok,
205              title);
206     }
207   }
208 
209  private:
print_tokenrange210   void print_token(int tok) const {
211     const gen_lex_token_string *x = &compiled_token_array[tok];
212     if (tok < 256) {
213       printf("/* %03d */  { \"\\x%02x\", 1, %s, %s},\n", tok, tok,
214              x->m_append_space ? "true" : "false",
215              x->m_start_expr ? "true" : "false");
216       return;
217     }
218 
219     if (!x->m_initialized) {
220       static const gen_lex_token_string dummy{"(unknown)", 9, true, false};
221       x = &dummy;
222     }
223     printf("/* %03d */  { \"%s\", %d, %s, %s},\n", tok, x->m_token_string,
224            x->m_token_length, x->m_append_space ? "true" : "false",
225            x->m_start_expr ? "true" : "false");
226   }
227 
228  private:
229   const char *const title;
230 
231  public:
232   const int start;
233 
234  private:
235   const int end;
236 
237   int max_seen;
238 };
239 
240 static_assert(YYUNDEF == 1150,
241               "YYUNDEF must be stable, because raw token numbers are used in "
242               "PFS digest calculations");
243 range range_for_sql_yacc2{"sql/sql_yacc.yy (before YYUNDEF)", YYUNDEF,
244                           MY_MAX_TOKEN};
245 
246 range range_for_digests{"digest specials", 1100, range_for_sql_yacc2.start - 1};
247 
248 static_assert(MAX_EXECUTION_TIME_HINT == 1000,
249               "MAX_EXECUTION_TIME_HINT should be equal to 1000");
250 range range_for_sql_hints{"sql/sql_hints.yy", MAX_EXECUTION_TIME_HINT,
251                           range_for_digests.start - 1};
252 
253 range range_for_sql_yacc1{"sql/sql_yacc.yy (after YYUNDEF)", 256,
254                           range_for_sql_hints.start - 1};
255 
/*
  Digest fake token values; assigned from range_for_digests in
  compute_tokens() and emitted as #defines by print_tokens().
*/
int tok_generic_value = 0;       ///< Fake token printed as "?".
int tok_generic_value_list = 0;  ///< Fake token printed as "?, ...".
int tok_row_single_value = 0;    ///< Fake token printed as "(?)".
int tok_row_single_value_list = 0;  ///< Fake token, "(?)" row list form.
int tok_row_multiple_value = 0;     ///< Fake token printed as "(...)".
int tok_row_multiple_value_list = 0;  ///< Fake token, "(...)" row list form.
int tok_in_generic_value_expression = 0;  ///< Fake token for "IN (...)".
int tok_ident = 0;     ///< Fake token printed as "(tok_id)".
int tok_ident_at = 0;  ///< Fake token for the left part of table\@query_block.
int tok_hint_comment_open =
    0;  ///< Fake token value for "/*+" of hint comments.
int tok_hint_comment_close =
    0;  ///< Fake token value for "*/" of hint comments.
int tok_unused = 0;  ///< Placeholder value, printed as "UNUSED".
270 
set_start_expr_token(int tok)271 static void set_start_expr_token(int tok) {
272   compiled_token_array[tok].m_start_expr = true;
273 }
274 
compute_tokens()275 static void compute_tokens() {
276   /*
277     Tokens made of just one terminal character
278   */
279 
280   // Do nothing -- see range::print() for token numbers in [0 .. 255]
281 
282   /*
283     Tokens hard coded in sql_lex.cc
284   */
285 
286   range_for_sql_yacc1.set_token(WITH_ROLLUP_SYM, "WITH ROLLUP", __LINE__);
287   range_for_sql_yacc1.set_token(NOT2_SYM, "!", __LINE__);
288   range_for_sql_yacc1.set_token(OR2_SYM, "||", __LINE__);
289   range_for_sql_yacc1.set_token(PARAM_MARKER, "?", __LINE__);
290   range_for_sql_yacc1.set_token(SET_VAR, ":=", __LINE__);
291   range_for_sql_yacc1.set_token(UNDERSCORE_CHARSET, "(_charset)", __LINE__);
292   range_for_sql_yacc1.set_token(END_OF_INPUT, "", __LINE__);
293   range_for_sql_yacc1.set_token(JSON_SEPARATOR_SYM, "->", __LINE__);
294   range_for_sql_yacc1.set_token(JSON_UNQUOTED_SEPARATOR_SYM, "->>", __LINE__);
295 
296   /*
297     Values.
298     These tokens are all normalized later,
299     so this strings will never be displayed.
300   */
301   range_for_sql_yacc1.set_token(BIN_NUM, "(bin)", __LINE__);
302   range_for_sql_yacc1.set_token(DECIMAL_NUM, "(decimal)", __LINE__);
303   range_for_sql_yacc1.set_token(FLOAT_NUM, "(float)", __LINE__);
304   range_for_sql_yacc1.set_token(HEX_NUM, "(hex)", __LINE__);
305   range_for_sql_yacc1.set_token(LEX_HOSTNAME, "(hostname)", __LINE__);
306   range_for_sql_yacc1.set_token(LONG_NUM, "(long)", __LINE__);
307   range_for_sql_yacc1.set_token(NUM, "(num)", __LINE__);
308   range_for_sql_yacc1.set_token(TEXT_STRING, "(text)", __LINE__);
309   range_for_sql_yacc1.set_token(NCHAR_STRING, "(nchar)", __LINE__);
310   range_for_sql_yacc1.set_token(ULONGLONG_NUM, "(ulonglong)", __LINE__);
311 
312   /*
313     Identifiers.
314   */
315   range_for_sql_yacc1.set_token(IDENT, "(id)", __LINE__);
316   range_for_sql_yacc1.set_token(IDENT_QUOTED, "(id_quoted)", __LINE__);
317 
318   /*
319     See symbols[] in sql/lex.h
320   */
321   for (const SYMBOL &sym : symbols) {
322     if ((sym.group & SG_MAIN_PARSER) != 0) {
323       if (sym.tok < YYUNDEF)
324         range_for_sql_yacc1.set_token(sym.tok, sym.name, __LINE__);
325       else
326         range_for_sql_yacc2.set_token(sym.tok, sym.name, __LINE__);
327     } else if ((sym.group & SG_HINTS) != 0) {
328       range_for_sql_hints.set_token(sym.tok, sym.name, __LINE__);
329     } else {
330       fprintf(stderr, "%s:%d: Unknown symbol group flag: %x\n", __FILE__,
331               __LINE__, sym.group & ~(SG_MAIN_PARSER | SG_HINTS));
332       exit(1);
333     }
334   }
335 
336   /*
337     Additional FAKE tokens,
338     used internally to normalize a digest text.
339   */
340 
341   /* Digest tokens in 5.7 */
342 
343   tok_generic_value = range_for_digests.add_token("?", __LINE__);
344   tok_generic_value_list = range_for_digests.add_token("?, ...", __LINE__);
345   tok_row_single_value = range_for_digests.add_token("(?)", __LINE__);
346   tok_row_single_value_list =
347       range_for_digests.add_token("(?) /* , ... */", __LINE__);
348   tok_row_multiple_value = range_for_digests.add_token("(...)", __LINE__);
349   tok_row_multiple_value_list =
350       range_for_digests.add_token("(...) /* , ... */", __LINE__);
351   tok_ident = range_for_digests.add_token("(tok_id)", __LINE__);
352   tok_ident_at = range_for_digests.add_token("(tok_id_at)", __LINE__);
353   tok_hint_comment_open =
354       range_for_digests.add_token(HINT_COMMENT_STARTER, __LINE__);
355   tok_hint_comment_close =
356       range_for_digests.add_token(HINT_COMMENT_TERMINATOR, __LINE__);
357 
358   /* New in 8.0 */
359 
360   tok_in_generic_value_expression =
361       range_for_digests.add_token("IN (...)", __LINE__);
362 
363   /* Add new digest tokens here */
364 
365   tok_unused = range_for_digests.add_token("UNUSED", __LINE__);
366 
367   /*
368     Fix whitespace for some special tokens.
369   */
370 
371   /*
372     The lexer parses "@@variable" as '@', '@', 'variable',
373     returning a token for '@' alone.
374 
375     This is incorrect, '@' is not really a token,
376     because the syntax "@ @ variable" (with spaces) is not accepted:
377     The lexer keeps some internal state after the '@' fake token.
378 
379     To work around this, digest text are printed as "@@variable".
380   */
381   compiled_token_array[(int)'@'].m_append_space = false;
382 
383   /*
384     Define additional properties for tokens.
385 
386     List all the token that are followed by an expression.
387     This is needed to differentiate unary from binary
388     '+' and '-' operators, because we want to:
389     - reduce <unary +> <NUM> to <?>,
390     - preserve <...> <binary +> <NUM> as is.
391   */
392   set_start_expr_token('(');
393   set_start_expr_token(',');
394   set_start_expr_token(EVERY_SYM);
395   set_start_expr_token(AT_SYM);
396   set_start_expr_token(STARTS_SYM);
397   set_start_expr_token(ENDS_SYM);
398   set_start_expr_token(DEFAULT_SYM);
399   set_start_expr_token(RETURN_SYM);
400   set_start_expr_token(IF);
401   set_start_expr_token(ELSEIF_SYM);
402   set_start_expr_token(CASE_SYM);
403   set_start_expr_token(WHEN_SYM);
404   set_start_expr_token(WHILE_SYM);
405   set_start_expr_token(UNTIL_SYM);
406   set_start_expr_token(SELECT_SYM);
407 
408   set_start_expr_token(OR_SYM);
409   set_start_expr_token(OR2_SYM);
410   set_start_expr_token(XOR);
411   set_start_expr_token(AND_SYM);
412   set_start_expr_token(AND_AND_SYM);
413   set_start_expr_token(NOT_SYM);
414   set_start_expr_token(BETWEEN_SYM);
415   set_start_expr_token(LIKE);
416   set_start_expr_token(REGEXP);
417 
418   set_start_expr_token('|');
419   set_start_expr_token('&');
420   set_start_expr_token(SHIFT_LEFT);
421   set_start_expr_token(SHIFT_RIGHT);
422   set_start_expr_token('+');
423   set_start_expr_token('-');
424   set_start_expr_token(INTERVAL_SYM);
425   set_start_expr_token('*');
426   set_start_expr_token('/');
427   set_start_expr_token('%');
428   set_start_expr_token(DIV_SYM);
429   set_start_expr_token(MOD_SYM);
430   set_start_expr_token('^');
431 }
432 
print_tokens()433 static void print_tokens() {
434   int tok;
435 
436   printf("#ifdef LEX_TOKEN_WITH_DEFINITION\n");
437   printf("lex_token_string lex_token_array[]=\n");
438   printf("{\n");
439   printf("/* PART 1: character tokens. */\n");
440 
441   for (tok = 0; tok < 256; tok++) {
442     printf("/* %03d */  { \"\\x%02x\", 1, %s, %s},\n", tok, tok,
443            compiled_token_array[tok].m_append_space ? "true" : "false",
444            compiled_token_array[tok].m_start_expr ? "true" : "false");
445   }
446 
447   range_for_sql_yacc1.print(
448       "/* PART 2: named tokens from sql/sql_yacc.yy (chunk 1). */",
449       "/* PART 3: padding reserved for sql/sql_yacc.yy extensions. */");
450 
451   range_for_sql_hints.print(
452       "/* PART 4: named tokens from sql/sql_hints.yy. */",
453       "/* PART 5: padding reserved for sql/sql_hints.yy extensions. */");
454 
455   range_for_digests.print(
456       "/* PART 6: Digest special tokens. */",
457       "/* PART 7: padding reserved for digest special tokens. */");
458 
459   range_for_sql_yacc2.print(
460       "/* PART 8: named tokens from sql/sql_yacc.yy (chunk 2). */");
461 
462   printf("/* PART 9: End of token list. */\n");
463 
464   printf("/* DUMMY */ { \"\", 0, false, false}\n");
465   printf("};\n");
466   printf("#endif /* LEX_TOKEN_WITH_DEFINITION */\n");
467 
468   printf("/* DIGEST specific tokens. */\n");
469   printf("#define TOK_GENERIC_VALUE %d\n", tok_generic_value);
470   printf("#define TOK_GENERIC_VALUE_LIST %d\n", tok_generic_value_list);
471   printf("#define TOK_ROW_SINGLE_VALUE %d\n", tok_row_single_value);
472   printf("#define TOK_ROW_SINGLE_VALUE_LIST %d\n", tok_row_single_value_list);
473   printf("#define TOK_ROW_MULTIPLE_VALUE %d\n", tok_row_multiple_value);
474   printf("#define TOK_ROW_MULTIPLE_VALUE_LIST %d\n",
475          tok_row_multiple_value_list);
476   printf("#define TOK_IDENT %d\n", tok_ident);
477   printf("#define TOK_IDENT_AT %d\n", tok_ident_at);
478   printf("#define TOK_HINT_COMMENT_OPEN %d\n", tok_hint_comment_open);
479   printf("#define TOK_HINT_COMMENT_CLOSE %d\n", tok_hint_comment_close);
480   printf("#define TOK_IN_GENERIC_VALUE_EXPRESSION %d\n",
481          tok_in_generic_value_expression);
482   printf("#define TOK_UNUSED %d\n", tok_unused);
483 }
484 
485 /*
486   ZEROFILL_SYM is the last token in the MySQL 5.7 token list,
487   see sql/sql_yacc.yy
488   The token value is frozen and should not change,
489   to avoid changing query digest values.
490 */
491 static const int zerofill_expected_value = 906;
492 
493 static_assert(!(ZEROFILL_SYM < zerofill_expected_value),
494               "Token deleted. "
495               "Please read MAINTAINER instructions in sql/sql_yacc.yy");
496 static_assert(!(ZEROFILL_SYM > zerofill_expected_value),
497               "Token added in the wrong place. "
498               "Please read MAINTAINER instructions in sql/sql_yacc.yy");
499 
main(int,char **)500 int main(int, char **) {
501   puts(ORACLE_GPL_COPYRIGHT_NOTICE("2016"));
502 
503   printf("/*\n");
504   printf("  This file is generated, do not edit.\n");
505   printf("  See file sql/gen_lex_token.cc.\n");
506   printf("*/\n");
507   printf("struct lex_token_string\n");
508   printf("{\n");
509   printf("  const char *m_token_string;\n");
510   printf("  int m_token_length;\n");
511   printf("  bool m_append_space;\n");
512   printf("  bool m_start_expr;\n");
513   printf("};\n");
514   printf("typedef struct lex_token_string lex_token_string;\n");
515 
516   compute_tokens();
517   print_tokens();
518 
519   return 0;
520 }
521