1 /*
2 Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License, version 2.0,
6 as published by the Free Software Foundation.
7
8 This program is also distributed with certain software (including
9 but not limited to OpenSSL) that is licensed under separate terms,
10 as designated in a particular file or component or in included license
11 documentation. The authors of MySQL hereby grant you an additional
12 permission to link the program and your derivative works with the
13 separately licensed software that they have included with MySQL.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23
24 #include <limits.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29 /* We only need the tokens here */
30 #define YYSTYPE_IS_DECLARED
31
32 #include "sql/lex.h"
33 #include "sql/lex_symbol.h"
34 #include "sql/sql_yacc.h"
35 #include "welcome_copyright_notice.h" /* ORACLE_WELCOME_COPYRIGHT_NOTICE */
36
37 /*
38 MAINTAINER:
39
Tokens printed in sql/lex_token.h come from several sources:
41 - tokens from sql_yacc.yy
42 - tokens from sql_hints.yy
43 - fake tokens for digests.
44
45 All the token values are mapped in the same space,
46 indexed by the token value directly.
47
To account for enhancements and new tokens,
gaps are created, so that adding a token from one source
does not change values of tokens from other sources.
51
52 This is done to ensure stability in digest computed values.
53
54 As of now (8.0.0), the mapping looks like this:
55 - PART 1: [0 .. 255] tokens of single-character lexemes
56 - PART 2: [256 .. ...] tokens < YYUNDEF from sql_yacc.yy
57 - PART 3: [... .. 999] reserved for sql_yacc.yy new tokens < YYUNDEF
58 - PART 4: [1000 .. ...] tokens from sql_hints.yy
59 - PART 5: [... .. 1099] reserved for sql_hints.yy new tokens
60 - PART 6: [1100 .. ...] digest special fake tokens
61 - PART 7: [... .. 1149] reserved for new digest special fake tokens
62 - PART 8: [1150 .. ...] tokens > YYUNDEF from sql_yacc.yy
63
64 Should gen_lex_token fail when tokens are exhausted
65 (maybe you are reading this comment because of a fprintf(stderr) below),
66 the options are as follows, by order of decreasing desirability:
67
68 1) Reuse OBSOLETE_TOKEN_XXX instead of consuming new token values
69
2) Consider if you really need to create a new token,
instead of reusing an existing one.
72
73 Keep in mind that syntax sugar in the parser still adds
74 to complexity, by making the parser tables bigger,
75 so adding tokens all the time is not a good practice.
76
77 3) Expand boundary values for
78 - range_for_sql_hints
79 - range_for_digests
80 and record again all the MTR tests that print a DIGEST,
81 because DIGEST values have now changed.
82
83 While at it, because digests have changed anyway,
84 please seriously consider to clean up and reorder:
85 - all the tokens in sql/sql_yacc.yy in one nice list,
86 ordered alphabetically, removing obsolete values if any.
87 - likewise for sql/sql_hints.yy
88 */
89
/**
  Generated token: the text substituted for one token value in a
  normalized (digest) query string, plus its formatting flags.
*/
struct gen_lex_token_string {
  gen_lex_token_string(const char *token_string, int token_length,
                       bool append_space, bool start_expr)
      : m_token_string{token_string},
        m_token_length{token_length},
        m_append_space{append_space},
        m_start_expr{start_expr} {}

  gen_lex_token_string() = default;

  /**
    Real lexeme string or user-specified text to output with a normalized
    query string.
  */
  const char *m_token_string{nullptr};

  /**
    Byte length of m_token_string.
  */
  int m_token_length{0};

  /**
    If true, output ' ' after this token to a normalized query string.
    See digest_add_token().
  */
  bool m_append_space{true};

  /**
    See digest_add_token().
  */
  bool m_start_expr{false};

  /**
    The structure is uninitialized if false.
  */
  bool m_initialized{false};
};
132
/*
  This is a tool used during build only,
  so MY_MAX_TOKEN does not need to be exact,
  only big enough to hold:
  - 256 of single-character lexeme tokens
  - up to 1000 named tokens from bison (sql_yacc.yy).
  - padding
  - tokens from bison (sql_hints.yy).
  - padding
  - DIGEST special tokens.
  - padding
  - more named tokens from bison (sql_yacc.yy).
  See also YYMAXUTOK.
*/
const int MY_MAX_TOKEN = 1200;

/*
  Lexeme data for every token value, indexed directly by token value.
  Filled by compute_tokens(), printed by print_tokens().
*/
gen_lex_token_string compiled_token_array[MY_MAX_TOKEN];
150
151 struct range {
rangerange152 range(const char *title, int start, int end)
153 : title{title}, start{start}, end{end}, max_seen{0} {}
154
set_tokenrange155 void set_token(int tok, const char *str, int line) {
156 if (tok <= 0) {
157 fprintf(stderr, "%s:%d: Bad token found\n", __FILE__, line);
158 exit(1);
159 }
160
161 if (tok > end) {
162 fprintf(stderr,
163 "%s:%d: Token reserve for %s exhausted: %d (should be <= %d).\n"
164 "Please see MAINTAINER instructions in sql/gen_lex_token.cc\n",
165 __FILE__, line, title, tok, end);
166 exit(1);
167 }
168
169 if (tok >= MY_MAX_TOKEN) {
170 fprintf(stderr,
171 "%s:%d: Added that many new keywords ? Increase MY_MAX_TOKEN\n",
172 __FILE__, line);
173 exit(1);
174 }
175
176 if (tok > max_seen) {
177 max_seen = tok;
178 }
179
180 compiled_token_array[tok].m_initialized = true;
181 compiled_token_array[tok].m_token_string = str;
182 compiled_token_array[tok].m_token_length = strlen(str);
183 compiled_token_array[tok].m_append_space = true;
184 compiled_token_array[tok].m_start_expr = false;
185 }
186
add_tokenrange187 int add_token(const char *str, int line) {
188 set_token(max_seen ? max_seen + 1 : start, str, line);
189 return max_seen;
190 }
191
printrange192 void print(const char *header1, const char *header2 = nullptr) const {
193 puts(header1);
194 for (int tok = start; tok <= max_seen; tok++) {
195 print_token(tok);
196 }
197
198 if (header2 == nullptr) {
199 return;
200 }
201
202 puts(header2);
203 for (int tok = max_seen + 1; tok <= end; tok++) {
204 printf("/* reserved %03d for %s */ { \"\", 0, false, false},\n", tok,
205 title);
206 }
207 }
208
209 private:
print_tokenrange210 void print_token(int tok) const {
211 const gen_lex_token_string *x = &compiled_token_array[tok];
212 if (tok < 256) {
213 printf("/* %03d */ { \"\\x%02x\", 1, %s, %s},\n", tok, tok,
214 x->m_append_space ? "true" : "false",
215 x->m_start_expr ? "true" : "false");
216 return;
217 }
218
219 if (!x->m_initialized) {
220 static const gen_lex_token_string dummy{"(unknown)", 9, true, false};
221 x = &dummy;
222 }
223 printf("/* %03d */ { \"%s\", %d, %s, %s},\n", tok, x->m_token_string,
224 x->m_token_length, x->m_append_space ? "true" : "false",
225 x->m_start_expr ? "true" : "false");
226 }
227
228 private:
229 const char *const title;
230
231 public:
232 const int start;
233
234 private:
235 const int end;
236
237 int max_seen;
238 };
239
240 static_assert(YYUNDEF == 1150,
241 "YYUNDEF must be stable, because raw token numbers are used in "
242 "PFS digest calculations");
243 range range_for_sql_yacc2{"sql/sql_yacc.yy (before YYUNDEF)", YYUNDEF,
244 MY_MAX_TOKEN};
245
246 range range_for_digests{"digest specials", 1100, range_for_sql_yacc2.start - 1};
247
248 static_assert(MAX_EXECUTION_TIME_HINT == 1000,
249 "MAX_EXECUTION_TIME_HINT should be equal to 1000");
250 range range_for_sql_hints{"sql/sql_hints.yy", MAX_EXECUTION_TIME_HINT,
251 range_for_digests.start - 1};
252
253 range range_for_sql_yacc1{"sql/sql_yacc.yy (after YYUNDEF)", 256,
254 range_for_sql_hints.start - 1};
255
/*
  Digest-only fake token values, assigned from range_for_digests by
  compute_tokens() and emitted as TOK_xxx defines by print_tokens().
  Zero means "not assigned yet".
*/
int tok_generic_value = 0;      ///< Fake token printed as "?".
int tok_generic_value_list = 0; ///< Fake token printed as "?, ...".
int tok_row_single_value = 0;   ///< Fake token printed as "(?)".
int tok_row_single_value_list = 0;    ///< Fake token for "(?) /* , ... */".
int tok_row_multiple_value = 0;       ///< Fake token printed as "(...)".
int tok_row_multiple_value_list = 0;  ///< Fake token for "(...) /* , ... */".
int tok_in_generic_value_expression = 0;  ///< Fake token for "IN (...)".
int tok_ident = 0;    ///< Fake token for a normalized identifier.
int tok_ident_at = 0; ///< Fake token for the left part of table\@query_block.
int tok_hint_comment_open =
    0; ///< Fake token value for "/*+" of hint comments.
int tok_hint_comment_close =
    0;               ///< Fake token value for "*/" of hint comments.
int tok_unused = 0;  ///< Marks the end of the assigned digest token values.
270
set_start_expr_token(int tok)271 static void set_start_expr_token(int tok) {
272 compiled_token_array[tok].m_start_expr = true;
273 }
274
/*
  Fill compiled_token_array with the lexeme text for every token value,
  and allocate the digest-only fake token values.

  NOTE: the order of the add_token() calls below determines the fake
  token values, which feed into digest computation — do not reorder.
*/
static void compute_tokens() {
  /*
    Tokens made of just one terminal character
  */

  // Do nothing -- see range::print() for token numbers in [0 .. 255]

  /*
    Tokens hard coded in sql_lex.cc
  */

  range_for_sql_yacc1.set_token(WITH_ROLLUP_SYM, "WITH ROLLUP", __LINE__);
  range_for_sql_yacc1.set_token(NOT2_SYM, "!", __LINE__);
  range_for_sql_yacc1.set_token(OR2_SYM, "||", __LINE__);
  range_for_sql_yacc1.set_token(PARAM_MARKER, "?", __LINE__);
  range_for_sql_yacc1.set_token(SET_VAR, ":=", __LINE__);
  range_for_sql_yacc1.set_token(UNDERSCORE_CHARSET, "(_charset)", __LINE__);
  range_for_sql_yacc1.set_token(END_OF_INPUT, "", __LINE__);
  range_for_sql_yacc1.set_token(JSON_SEPARATOR_SYM, "->", __LINE__);
  range_for_sql_yacc1.set_token(JSON_UNQUOTED_SEPARATOR_SYM, "->>", __LINE__);

  /*
    Values.
    These tokens are all normalized later,
    so these strings will never be displayed.
  */
  range_for_sql_yacc1.set_token(BIN_NUM, "(bin)", __LINE__);
  range_for_sql_yacc1.set_token(DECIMAL_NUM, "(decimal)", __LINE__);
  range_for_sql_yacc1.set_token(FLOAT_NUM, "(float)", __LINE__);
  range_for_sql_yacc1.set_token(HEX_NUM, "(hex)", __LINE__);
  range_for_sql_yacc1.set_token(LEX_HOSTNAME, "(hostname)", __LINE__);
  range_for_sql_yacc1.set_token(LONG_NUM, "(long)", __LINE__);
  range_for_sql_yacc1.set_token(NUM, "(num)", __LINE__);
  range_for_sql_yacc1.set_token(TEXT_STRING, "(text)", __LINE__);
  range_for_sql_yacc1.set_token(NCHAR_STRING, "(nchar)", __LINE__);
  range_for_sql_yacc1.set_token(ULONGLONG_NUM, "(ulonglong)", __LINE__);

  /*
    Identifiers.
  */
  range_for_sql_yacc1.set_token(IDENT, "(id)", __LINE__);
  range_for_sql_yacc1.set_token(IDENT_QUOTED, "(id_quoted)", __LINE__);

  /*
    See symbols[] in sql/lex.h
    Every named symbol is routed to the range that owns its token value.
  */
  for (const SYMBOL &sym : symbols) {
    if ((sym.group & SG_MAIN_PARSER) != 0) {
      if (sym.tok < YYUNDEF)
        range_for_sql_yacc1.set_token(sym.tok, sym.name, __LINE__);
      else
        range_for_sql_yacc2.set_token(sym.tok, sym.name, __LINE__);
    } else if ((sym.group & SG_HINTS) != 0) {
      range_for_sql_hints.set_token(sym.tok, sym.name, __LINE__);
    } else {
      fprintf(stderr, "%s:%d: Unknown symbol group flag: %x\n", __FILE__,
              __LINE__, sym.group & ~(SG_MAIN_PARSER | SG_HINTS));
      exit(1);
    }
  }

  /*
    Additional FAKE tokens,
    used internally to normalize a digest text.
    Call order below fixes the token values: do not reorder.
  */

  /* Digest tokens in 5.7 */

  tok_generic_value = range_for_digests.add_token("?", __LINE__);
  tok_generic_value_list = range_for_digests.add_token("?, ...", __LINE__);
  tok_row_single_value = range_for_digests.add_token("(?)", __LINE__);
  tok_row_single_value_list =
      range_for_digests.add_token("(?) /* , ... */", __LINE__);
  tok_row_multiple_value = range_for_digests.add_token("(...)", __LINE__);
  tok_row_multiple_value_list =
      range_for_digests.add_token("(...) /* , ... */", __LINE__);
  tok_ident = range_for_digests.add_token("(tok_id)", __LINE__);
  tok_ident_at = range_for_digests.add_token("(tok_id_at)", __LINE__);
  tok_hint_comment_open =
      range_for_digests.add_token(HINT_COMMENT_STARTER, __LINE__);
  tok_hint_comment_close =
      range_for_digests.add_token(HINT_COMMENT_TERMINATOR, __LINE__);

  /* New in 8.0 */

  tok_in_generic_value_expression =
      range_for_digests.add_token("IN (...)", __LINE__);

  /* Add new digest tokens here */

  tok_unused = range_for_digests.add_token("UNUSED", __LINE__);

  /*
    Fix whitespace for some special tokens.
  */

  /*
    The lexer parses "@@variable" as '@', '@', 'variable',
    returning a token for '@' alone.

    This is incorrect, '@' is not really a token,
    because the syntax "@ @ variable" (with spaces) is not accepted:
    The lexer keeps some internal state after the '@' fake token.

    To work around this, digest text are printed as "@@variable".
  */
  compiled_token_array[(int)'@'].m_append_space = false;

  /*
    Define additional properties for tokens.

    List all the token that are followed by an expression.
    This is needed to differentiate unary from binary
    '+' and '-' operators, because we want to:
    - reduce <unary +> <NUM> to <?>,
    - preserve <...> <binary +> <NUM> as is.
  */
  set_start_expr_token('(');
  set_start_expr_token(',');
  set_start_expr_token(EVERY_SYM);
  set_start_expr_token(AT_SYM);
  set_start_expr_token(STARTS_SYM);
  set_start_expr_token(ENDS_SYM);
  set_start_expr_token(DEFAULT_SYM);
  set_start_expr_token(RETURN_SYM);
  set_start_expr_token(IF);
  set_start_expr_token(ELSEIF_SYM);
  set_start_expr_token(CASE_SYM);
  set_start_expr_token(WHEN_SYM);
  set_start_expr_token(WHILE_SYM);
  set_start_expr_token(UNTIL_SYM);
  set_start_expr_token(SELECT_SYM);

  set_start_expr_token(OR_SYM);
  set_start_expr_token(OR2_SYM);
  set_start_expr_token(XOR);
  set_start_expr_token(AND_SYM);
  set_start_expr_token(AND_AND_SYM);
  set_start_expr_token(NOT_SYM);
  set_start_expr_token(BETWEEN_SYM);
  set_start_expr_token(LIKE);
  set_start_expr_token(REGEXP);

  set_start_expr_token('|');
  set_start_expr_token('&');
  set_start_expr_token(SHIFT_LEFT);
  set_start_expr_token(SHIFT_RIGHT);
  set_start_expr_token('+');
  set_start_expr_token('-');
  set_start_expr_token(INTERVAL_SYM);
  set_start_expr_token('*');
  set_start_expr_token('/');
  set_start_expr_token('%');
  set_start_expr_token(DIV_SYM);
  set_start_expr_token(MOD_SYM);
  set_start_expr_token('^');
}
432
/*
  Emit the generated sql/lex_token.h content to stdout:
  the lex_token_array definition (PARTS 1-9) followed by the TOK_xxx
  defines for the digest fake tokens. Output bytes are the product of
  this tool; every printf here is part of the generated file.
*/
static void print_tokens() {
  int tok;

  printf("#ifdef LEX_TOKEN_WITH_DEFINITION\n");
  printf("lex_token_string lex_token_array[]=\n");
  printf("{\n");
  printf("/* PART 1: character tokens. */\n");

  /* Single-character lexemes: the token value is the character itself. */
  for (tok = 0; tok < 256; tok++) {
    printf("/* %03d */ { \"\\x%02x\", 1, %s, %s},\n", tok, tok,
           compiled_token_array[tok].m_append_space ? "true" : "false",
           compiled_token_array[tok].m_start_expr ? "true" : "false");
  }

  range_for_sql_yacc1.print(
      "/* PART 2: named tokens from sql/sql_yacc.yy (chunk 1). */",
      "/* PART 3: padding reserved for sql/sql_yacc.yy extensions. */");

  range_for_sql_hints.print(
      "/* PART 4: named tokens from sql/sql_hints.yy. */",
      "/* PART 5: padding reserved for sql/sql_hints.yy extensions. */");

  range_for_digests.print(
      "/* PART 6: Digest special tokens. */",
      "/* PART 7: padding reserved for digest special tokens. */");

  /* No padding header: this range extends to the end of the array. */
  range_for_sql_yacc2.print(
      "/* PART 8: named tokens from sql/sql_yacc.yy (chunk 2). */");

  printf("/* PART 9: End of token list. */\n");

  printf("/* DUMMY */ { \"\", 0, false, false}\n");
  printf("};\n");
  printf("#endif /* LEX_TOKEN_WITH_DEFINITION */\n");

  /* TOK_xxx values used by the digest code in the server. */
  printf("/* DIGEST specific tokens. */\n");
  printf("#define TOK_GENERIC_VALUE %d\n", tok_generic_value);
  printf("#define TOK_GENERIC_VALUE_LIST %d\n", tok_generic_value_list);
  printf("#define TOK_ROW_SINGLE_VALUE %d\n", tok_row_single_value);
  printf("#define TOK_ROW_SINGLE_VALUE_LIST %d\n", tok_row_single_value_list);
  printf("#define TOK_ROW_MULTIPLE_VALUE %d\n", tok_row_multiple_value);
  printf("#define TOK_ROW_MULTIPLE_VALUE_LIST %d\n",
         tok_row_multiple_value_list);
  printf("#define TOK_IDENT %d\n", tok_ident);
  printf("#define TOK_IDENT_AT %d\n", tok_ident_at);
  printf("#define TOK_HINT_COMMENT_OPEN %d\n", tok_hint_comment_open);
  printf("#define TOK_HINT_COMMENT_CLOSE %d\n", tok_hint_comment_close);
  printf("#define TOK_IN_GENERIC_VALUE_EXPRESSION %d\n",
         tok_in_generic_value_expression);
  printf("#define TOK_UNUSED %d\n", tok_unused);
}
484
/*
  ZEROFILL_SYM is the last token in the MySQL 5.7 token list,
  see sql/sql_yacc.yy
  The token value is frozen and should not change,
  to avoid changing query digest values.
*/
static const int zerofill_expected_value = 906;

/* Split into two asserts so the message names the likely mistake. */
static_assert(!(ZEROFILL_SYM < zerofill_expected_value),
              "Token deleted. "
              "Please read MAINTAINER instructions in sql/sql_yacc.yy");
static_assert(!(ZEROFILL_SYM > zerofill_expected_value),
              "Token added in the wrong place. "
              "Please read MAINTAINER instructions in sql/sql_yacc.yy");
499
/*
  Entry point: write the generated sql/lex_token.h to stdout.
  Emits the copyright notice and the lex_token_string struct definition,
  then computes the token table and prints it.
*/
int main(int, char **) {
  puts(ORACLE_GPL_COPYRIGHT_NOTICE("2016"));

  /* Preamble of the generated header. */
  printf("/*\n");
  printf("  This file is generated, do not edit.\n");
  printf("  See file sql/gen_lex_token.cc.\n");
  printf("*/\n");
  printf("struct lex_token_string\n");
  printf("{\n");
  printf("  const char *m_token_string;\n");
  printf("  int m_token_length;\n");
  printf("  bool m_append_space;\n");
  printf("  bool m_start_expr;\n");
  printf("};\n");
  printf("typedef struct lex_token_string lex_token_string;\n");

  compute_tokens();
  print_tokens();

  return 0;
}
521