1 /* Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
2    Copyright (c) 2017, MariaDB Corporation.
3 
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; version 2 of the License.
7 
8   This program is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   GNU General Public License for more details.
12 
13   You should have received a copy of the GNU General Public License
14   along with this program; if not, write to the Free Software Foundation,
15   51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
16 
17 /*
18   This code needs extra visibility in the lexer structures
19 */
20 
21 #include "mariadb.h"
22 #include "my_md5.h"
23 #include "unireg.h"
24 
25 #include "sql_string.h"
26 #include "sql_class.h"
27 #include "sql_lex.h"
28 #include "sp_pcontext.h"
29 #include "sql_digest.h"
30 #include "sql_digest_stream.h"
31 
32 #include "sql_get_diagnostics.h"
33 
34 /* Generated code */
35 #include "yy_mariadb.hh"
36 #define LEX_TOKEN_WITH_DEFINITION
37 #include "lex_token.h"
38 
39 /* Name pollution from sql/sql_lex.h */
40 #ifdef LEX_YYSTYPE
41 #undef LEX_YYSTYPE
42 #endif
43 
44 #define LEX_YYSTYPE YYSTYPE*
45 
46 #define SIZE_OF_A_TOKEN 2
47 
48 /**
49   Read a single token from token array.
50 */
read_token(const sql_digest_storage * digest_storage,uint index,uint * tok)51 inline uint read_token(const sql_digest_storage *digest_storage,
52                        uint index, uint *tok)
53 {
54   uint safe_byte_count= digest_storage->m_byte_count;
55 
56   if (index + SIZE_OF_A_TOKEN <= safe_byte_count &&
57       safe_byte_count <= digest_storage->m_token_array_length)
58   {
59     const unsigned char *src= & digest_storage->m_token_array[index];
60     *tok= src[0] | (src[1] << 8);
61     return index + SIZE_OF_A_TOKEN;
62   }
63 
64   /* The input byte stream is exhausted. */
65   *tok= 0;
66   return MAX_DIGEST_STORAGE_SIZE + 1;
67 }
68 
69 /**
70   Store a single token in token array.
71 */
store_token(sql_digest_storage * digest_storage,uint token)72 inline void store_token(sql_digest_storage* digest_storage, uint token)
73 {
74   DBUG_ASSERT(digest_storage->m_byte_count <= digest_storage->m_token_array_length);
75 
76   if (digest_storage->m_byte_count + SIZE_OF_A_TOKEN <= digest_storage->m_token_array_length)
77   {
78     unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
79     dest[0]= token & 0xff;
80     dest[1]= (token >> 8) & 0xff;
81     digest_storage->m_byte_count+= SIZE_OF_A_TOKEN;
82   }
83   else
84   {
85     digest_storage->m_full= true;
86   }
87 }
88 
89 /**
90   Read an identifier from token array.
91 */
read_identifier(const sql_digest_storage * digest_storage,uint index,char ** id_string,int * id_length)92 inline uint read_identifier(const sql_digest_storage* digest_storage,
93                             uint index, char ** id_string, int *id_length)
94 {
95   uint new_index;
96   uint safe_byte_count= digest_storage->m_byte_count;
97 
98   DBUG_ASSERT(index <= safe_byte_count);
99   DBUG_ASSERT(safe_byte_count <= digest_storage->m_token_array_length);
100 
101   /*
102     token + length + string are written in an atomic way,
103     so we do always expect a length + string here
104   */
105 
106   uint bytes_needed= SIZE_OF_A_TOKEN;
107   /* If we can read token and identifier length */
108   if ((index + bytes_needed) <= safe_byte_count)
109   {
110     const unsigned char *src= & digest_storage->m_token_array[index];
111     /* Read the length of identifier */
112     uint length= src[0] | (src[1] << 8);
113     bytes_needed+= length;
114     /* If we can read entire identifier from token array */
115     if ((index + bytes_needed) <= safe_byte_count)
116     {
117       *id_string= (char *) (src + 2);
118       *id_length= length;
119 
120       new_index= index + bytes_needed;
121       DBUG_ASSERT(new_index <= safe_byte_count);
122       return new_index;
123     }
124   }
125 
126   /* The input byte stream is exhausted. */
127   return MAX_DIGEST_STORAGE_SIZE + 1;
128 }
129 
130 /**
131   Store an identifier in token array.
132 */
store_token_identifier(sql_digest_storage * digest_storage,uint token,size_t id_length,const char * id_name)133 inline void store_token_identifier(sql_digest_storage* digest_storage,
134                                    uint token,
135                                    size_t id_length, const char *id_name)
136 {
137   DBUG_ASSERT(digest_storage->m_byte_count <= digest_storage->m_token_array_length);
138 
139   size_t bytes_needed= 2 * SIZE_OF_A_TOKEN + id_length;
140   if (digest_storage->m_byte_count + bytes_needed <= (unsigned int)digest_storage->m_token_array_length)
141   {
142     unsigned char* dest= & digest_storage->m_token_array[digest_storage->m_byte_count];
143     /* Write the token */
144     dest[0]= token & 0xff;
145     dest[1]= (token >> 8) & 0xff;
146     /* Write the string length */
147     dest[2]= id_length & 0xff;
148     dest[3]= (id_length >> 8) & 0xff;
149     /* Write the string data */
150     if (id_length > 0)
151       memcpy((char *)(dest + 4), id_name, id_length);
152     digest_storage->m_byte_count+= (uint)bytes_needed;
153   }
154   else
155   {
156     digest_storage->m_full= true;
157   }
158 }
159 
compute_digest_md5(const sql_digest_storage * digest_storage,unsigned char * md5)160 void compute_digest_md5(const sql_digest_storage *digest_storage, unsigned char *md5)
161 {
162   compute_md5_hash(md5,
163                    (const char *) digest_storage->m_token_array,
164                    digest_storage->m_byte_count);
165 }
166 
/*
  Iterate token array and updates digest_text.

  Walks the token stream produced by digest_add_token() and renders it as a
  human-readable normalized statement (identifiers back-quoted, literals
  already reduced to '?').  The output text is part of the digest contract:
  the exact append order below must not change, or digest texts would differ
  between server versions for the same statement.

  NOTE(review): digest_storage may be concurrently written by another
  thread (dirty reads); every inconsistency found mid-walk aborts silently.
*/
void compute_digest_text(const sql_digest_storage* digest_storage,
                         String *digest_text)
{
  DBUG_ASSERT(digest_storage != NULL);
  uint byte_count= digest_storage->m_byte_count;
  String *digest_output= digest_text;
  uint tok= 0;
  uint current_byte= 0;
  lex_token_string *tok_data;

  /* Reset existing data */
  digest_output->length(0);

  /* Inconsistent snapshot (dirty read): emit a single NUL and give up. */
  if (byte_count > digest_storage->m_token_array_length)
  {
    digest_output->append("\0", 1);
    return;
  }

  /* Convert text to utf8 */
  const CHARSET_INFO *from_cs= get_charset(digest_storage->m_charset_number, MYF(0));
  const CHARSET_INFO *to_cs= &my_charset_utf8mb3_bin;

  if (from_cs == NULL)
  {
    /*
      Can happen, as we do dirty reads on digest_storage,
      which can be written to in another thread.
    */
    digest_output->append("\0", 1);
    return;
  }

  char id_buffer[NAME_LEN + 1]= {'\0'};
  char *id_string;
  size_t id_length;
  /* Only convert identifiers when source and target charsets differ. */
  bool convert_text= !my_charset_same(from_cs, to_cs);

  while (current_byte < byte_count)
  {
    current_byte= read_token(digest_storage, current_byte, &tok);

    /* Stop on exhausted stream, out-of-range token, or over-long output. */
    if (tok <= 0 || tok >= array_elements(lex_token_array)
        || current_byte > max_digest_length)
      return;

    tok_data= &lex_token_array[tok];

    switch (tok)
    {
    /* All identifiers are printed with their name. */
    case IDENT:
    case IDENT_QUOTED:
    case TOK_IDENT:
      {
        char *id_ptr= NULL;
        int id_len= 0;
        uint err_cs= 0;

        /* Get the next identifier from the storage buffer. */
        current_byte= read_identifier(digest_storage, current_byte,
                                      &id_ptr, &id_len);
        if (current_byte > max_digest_length)
          return;

        if (convert_text)
        {
          /* Verify that the converted text will fit. */
          if (to_cs->mbmaxlen*id_len > NAME_LEN)
          {
            /* Identifier too long after conversion: print a placeholder. */
            digest_output->append("...", 3);
            break;
          }
          /* Convert identifier string into the storage character set. */
          id_length= my_convert(id_buffer, NAME_LEN, to_cs,
                                id_ptr, id_len, from_cs, &err_cs);
          id_string= id_buffer;
        }
        else
        {
          id_string= id_ptr;
          id_length= id_len;
        }

        /* Skip empty identifiers and conversion failures silently. */
        if (id_length == 0 || err_cs != 0)
        {
          break;
        }
        /* Copy the converted identifier into the digest string. */
        digest_output->append("`", 1);
        if (id_length > 0)
          digest_output->append(id_string, id_length);
        digest_output->append("` ", 2);
      }
      break;

    /* Everything else is printed as is. */
    default:
      /*
        Make sure not to overflow digest_text buffer.
        +1 is to make sure extra space for ' '.
      */
      int tok_length= tok_data->m_token_length;

      digest_output->append(tok_data->m_token_string, tok_length);
      if (tok_data->m_append_space)
        digest_output->append(" ", 1);
      break;
    }
  }
}
281 
peek_token(const sql_digest_storage * digest,uint index)282 static inline uint peek_token(const sql_digest_storage *digest, uint index)
283 {
284   uint token;
285   DBUG_ASSERT(index + SIZE_OF_A_TOKEN <= digest->m_byte_count);
286   DBUG_ASSERT(digest->m_byte_count <=  digest->m_token_array_length);
287 
288   token= ((digest->m_token_array[index + 1])<<8) | digest->m_token_array[index];
289   return token;
290 }
291 
292 /**
293   Function to read last two tokens from token array. If an identifier
294   is found, do not look for token before that.
295 */
peek_last_two_tokens(const sql_digest_storage * digest_storage,uint last_id_index,uint * t1,uint * t2)296 static inline void peek_last_two_tokens(const sql_digest_storage* digest_storage,
297                                         uint last_id_index, uint *t1, uint *t2)
298 {
299   uint byte_count= digest_storage->m_byte_count;
300   uint peek_index= byte_count;
301 
302   if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
303   {
304     /* Take last token. */
305     peek_index-= SIZE_OF_A_TOKEN;
306     *t1= peek_token(digest_storage, peek_index);
307 
308     if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
309     {
310       /* Take 2nd token from last. */
311       peek_index-= SIZE_OF_A_TOKEN;
312       *t2= peek_token(digest_storage, peek_index);
313     }
314     else
315     {
316       *t2= TOK_UNUSED;
317     }
318   }
319   else
320   {
321     *t1= TOK_UNUSED;
322     *t2= TOK_UNUSED;
323   }
324 }
325 
326 /**
327   Function to read last three tokens from token array. If an identifier
328   is found, do not look for token before that.
329 */
peek_last_three_tokens(const sql_digest_storage * digest_storage,uint last_id_index,uint * t1,uint * t2,uint * t3)330 static inline void peek_last_three_tokens(const sql_digest_storage* digest_storage,
331                                           uint last_id_index, uint *t1, uint *t2, uint *t3)
332 {
333   uint byte_count= digest_storage->m_byte_count;
334   uint peek_index= byte_count;
335 
336   if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
337   {
338     /* Take last token. */
339     peek_index-= SIZE_OF_A_TOKEN;
340     *t1= peek_token(digest_storage, peek_index);
341 
342     if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
343     {
344       /* Take 2nd token from last. */
345       peek_index-= SIZE_OF_A_TOKEN;
346       *t2= peek_token(digest_storage, peek_index);
347 
348       if (last_id_index + SIZE_OF_A_TOKEN <= peek_index)
349       {
350         /* Take 3rd token from last. */
351         peek_index-= SIZE_OF_A_TOKEN;
352         *t3= peek_token(digest_storage, peek_index);
353       }
354       else
355       {
356         *t3= TOK_UNUSED;
357       }
358     }
359     else
360     {
361       *t2= TOK_UNUSED;
362       *t3= TOK_UNUSED;
363     }
364   }
365   else
366   {
367     *t1= TOK_UNUSED;
368     *t2= TOK_UNUSED;
369     *t3= TOK_UNUSED;
370   }
371 }
372 
/**
  Feed one parser token into the digest under construction, applying
  on-the-fly "reduce" rules so that statements differing only in literal
  values produce the same digest (e.g. "a = 1" and "a = 2" both become
  "a = ?").  Reductions pop already-stored tokens by rewinding
  m_byte_count, then push the reduced token.

  @param state   digest state being built
  @param token   token just produced by the lexer/parser
  @param yylval  token semantic value; only read for IDENT/IDENT_QUOTED
  @return state to continue collecting, or NULL to stop (digest full or
          END_OF_INPUT seen)
*/
sql_digest_state* digest_add_token(sql_digest_state *state,
                                   uint token,
                                   LEX_YYSTYPE yylval)
{
  sql_digest_storage *digest_storage= NULL;

  digest_storage= &state->m_digest_storage;

  /*
    Stop collecting further tokens if digest storage is full or
    if END token is received.
  */
  if (digest_storage->m_full || token == END_OF_INPUT)
    return NULL;

  /*
    Take last_token 2 tokens collected till now. These tokens will be used
    in reduce for normalisation. Make sure not to consider ID tokens in reduce.
  */
  uint last_token;
  uint last_token2;

  switch (token)
  {
    /* Numeric literals: fold a preceding unary sign, then fall through
       to the generic literal reduction below. */
    case NUM:
    case LONG_NUM:
    case ULONGLONG_NUM:
    case DECIMAL_NUM:
    case FLOAT_NUM:
    case BIN_NUM:
    case HEX_NUM:
    {
      bool found_unary;
      do
      {
        found_unary= false;
        peek_last_two_tokens(digest_storage, state->m_last_id_index,
                             &last_token, &last_token2);

        if ((last_token == '-') || (last_token == '+'))
        {
          /*
            We need to differentiate:
            - a <unary minus> operator
            - a <unary plus> operator
            from
            - a <binary minus> operator
            - a <binary plus> operator
            to only reduce "a = -1" to "a = ?", and not change "b - 1" to "b ?"

            Binary operators are found inside an expression,
            while unary operators are found at the beginning of an expression, or after operators.

            To achieve this, every token that is followed by an <expr> expression
            in the SQL grammar is flagged.
            See sql/sql_yacc.yy
            See sql/gen_lex_token.cc

            For example,
            "(-1)" is parsed as "(", "-", NUM, ")", and lex_token_array["("].m_start_expr is true,
            so reduction of the "-" NUM is done, the result is "(?)".
            "(a-1)" is parsed as "(", ID, "-", NUM, ")", and lex_token_array[ID].m_start_expr is false,
            so the operator is binary, no reduction is done, and the result is "(a-?)".
          */
          if (lex_token_array[last_token2].m_start_expr)
          {
            /*
              REDUCE:
              TOK_GENERIC_VALUE := (UNARY_PLUS | UNARY_MINUS) (NUM | LOG_NUM | ... | FLOAT_NUM)

              REDUCE:
              TOK_GENERIC_VALUE := (UNARY_PLUS | UNARY_MINUS) TOK_GENERIC_VALUE
            */
            token= TOK_GENERIC_VALUE;
            /* Pop the sign token already stored. */
            digest_storage->m_byte_count-= SIZE_OF_A_TOKEN;
            found_unary= true;
          }
        }
      } while (found_unary);
    }
    /* for case NULL_SYM below */
    /* fall through */
    case LEX_HOSTNAME:
    case TEXT_STRING:
    case NCHAR_STRING:
    case PARAM_MARKER:
    {
      /*
        REDUCE:
        TOK_GENERIC_VALUE := BIN_NUM | DECIMAL_NUM | ... | ULONGLONG_NUM
      */
      token= TOK_GENERIC_VALUE;

      peek_last_two_tokens(digest_storage, state->m_last_id_index,
                           &last_token, &last_token2);

      if ((last_token2 == TOK_GENERIC_VALUE ||
           last_token2 == TOK_GENERIC_VALUE_LIST) &&
          (last_token == ','))
      {
        /*
          REDUCE:
          TOK_GENERIC_VALUE_LIST :=
            TOK_GENERIC_VALUE ',' TOK_GENERIC_VALUE

          REDUCE:
          TOK_GENERIC_VALUE_LIST :=
            TOK_GENERIC_VALUE_LIST ',' TOK_GENERIC_VALUE
        */
        /* Pop "value ','" (2 tokens) and replace with the list token. */
        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
        token= TOK_GENERIC_VALUE_LIST;
      }
      /*
        Add this token or the resulting reduce to digest storage.
      */
      store_token(digest_storage, token);
      break;
    }
    /* Closing parenthesis: collapse parenthesized value (lists) into row
       values, so "IN (1,2),(3,4)" style constructs normalize. */
    case ')':
    {
      peek_last_two_tokens(digest_storage, state->m_last_id_index,
                           &last_token, &last_token2);

      if (last_token == TOK_GENERIC_VALUE &&
          last_token2 == '(')
      {
        /*
          REDUCE:
          TOK_ROW_SINGLE_VALUE :=
            '(' TOK_GENERIC_VALUE ')'
        */
        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
        token= TOK_ROW_SINGLE_VALUE;

        /* Read last two tokens again */
        peek_last_two_tokens(digest_storage, state->m_last_id_index,
                             &last_token, &last_token2);

        if ((last_token2 == TOK_ROW_SINGLE_VALUE ||
             last_token2 == TOK_ROW_SINGLE_VALUE_LIST) &&
            (last_token == ','))
        {
          /*
            REDUCE:
            TOK_ROW_SINGLE_VALUE_LIST :=
              TOK_ROW_SINGLE_VALUE ',' TOK_ROW_SINGLE_VALUE

            REDUCE:
            TOK_ROW_SINGLE_VALUE_LIST :=
              TOK_ROW_SINGLE_VALUE_LIST ',' TOK_ROW_SINGLE_VALUE
          */
          digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
          token= TOK_ROW_SINGLE_VALUE_LIST;
        }
      }
      else if (last_token == TOK_GENERIC_VALUE_LIST &&
               last_token2 == '(')
      {
        /*
          REDUCE:
          TOK_ROW_MULTIPLE_VALUE :=
            '(' TOK_GENERIC_VALUE_LIST ')'
        */
        digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
        token= TOK_ROW_MULTIPLE_VALUE;

        /* Read last two tokens again */
        peek_last_two_tokens(digest_storage, state->m_last_id_index,
                             &last_token, &last_token2);

        if ((last_token2 == TOK_ROW_MULTIPLE_VALUE ||
             last_token2 == TOK_ROW_MULTIPLE_VALUE_LIST) &&
            (last_token == ','))
        {
          /*
            REDUCE:
            TOK_ROW_MULTIPLE_VALUE_LIST :=
              TOK_ROW_MULTIPLE_VALUE ',' TOK_ROW_MULTIPLE_VALUE

            REDUCE:
            TOK_ROW_MULTIPLE_VALUE_LIST :=
              TOK_ROW_MULTIPLE_VALUE_LIST ',' TOK_ROW_MULTIPLE_VALUE
          */
          digest_storage->m_byte_count-= 2*SIZE_OF_A_TOKEN;
          token= TOK_ROW_MULTIPLE_VALUE_LIST;
        }
      }
      /*
        Add this token or the resulting reduce to digest storage.
      */
      store_token(digest_storage, token);
      break;
    }
    case IDENT:
    case IDENT_QUOTED:
    {
      YYSTYPE *lex_token= yylval;
      const char *yytext= lex_token->lex_str.str;
      size_t yylen= lex_token->lex_str.length;

      /*
        REDUCE:
          TOK_IDENT := IDENT | IDENT_QUOTED
        The parser gives IDENT or IDENT_TOKEN for the same text,
        depending on the character set used.
        We unify both to always print the same digest text,
        and always have the same digest hash.
      */
      token= TOK_IDENT;
      /* Add this token and identifier string to digest storage. */
      store_token_identifier(digest_storage, token, yylen, yytext);

      /* Update the index of last identifier found. */
      state->m_last_id_index= digest_storage->m_byte_count;
      break;
    }
    default:
    {
      /* Add this token to digest storage. */
      store_token(digest_storage, token);
      break;
    }
  }

  return state;
}
599 
/**
  Replace the token token_right, already stored in the digest, with
  token_left, then re-apply the generic value-list reduction.  At most one
  token (TOKEN_Y) may have been stored after token_right; it is popped,
  the reduction applied, and pushed back.

  @param state        digest state being built
  @param token_left   replacement token (the reduction result)
  @param token_right  token to replace (must be the last or second-to-last
                      stored token)
  @return state to continue collecting, or NULL when the digest is full
*/
sql_digest_state* digest_reduce_token(sql_digest_state *state,
                                      uint token_left, uint token_right)
{
  sql_digest_storage *digest_storage= NULL;

  digest_storage= &state->m_digest_storage;

  /*
    Stop collecting further tokens if digest storage is full.
  */
  if (digest_storage->m_full)
    return NULL;

  uint last_token;
  uint last_token2;
  uint last_token3;
  uint token_to_push= TOK_UNUSED;

  peek_last_two_tokens(digest_storage, state->m_last_id_index,
                       &last_token, &last_token2);

  /*
    There is only one caller of digest_reduce_token(),
    see sql/sql_yacc.yy, rule literal := NULL_SYM.
    REDUCE:
      token_left := token_right
    Used for:
      TOK_GENERIC_VALUE := NULL_SYM
  */

  if (last_token == token_right)
  {
    /*
      Current stream is like:
        TOKEN_X TOKEN_RIGHT .
      REDUCE to
        TOKEN_X TOKEN_LEFT .
    */
    digest_storage->m_byte_count-= SIZE_OF_A_TOKEN;
    store_token(digest_storage, token_left);
  }
  else
  {
    /*
      Current stream is like:
        TOKEN_X TOKEN_RIGHT TOKEN_Y .
      Pop TOKEN_Y
        TOKEN_X TOKEN_RIGHT . TOKEN_Y
      REDUCE to
        TOKEN_X TOKEN_LEFT . TOKEN_Y
    */
    DBUG_ASSERT(last_token2 == token_right);
    digest_storage->m_byte_count-= 2 * SIZE_OF_A_TOKEN;
    store_token(digest_storage, token_left);
    /* Remember TOKEN_Y; it is re-pushed after the list reduction below. */
    token_to_push= last_token;
  }

  /* The replacement may have created a "value ',' value" run: re-check. */
  peek_last_three_tokens(digest_storage, state->m_last_id_index,
                         &last_token, &last_token2, &last_token3);

  if ((last_token3 == TOK_GENERIC_VALUE ||
       last_token3 == TOK_GENERIC_VALUE_LIST) &&
      (last_token2 == ',') &&
      (last_token == TOK_GENERIC_VALUE))
  {
    /*
      REDUCE:
      TOK_GENERIC_VALUE_LIST :=
        TOK_GENERIC_VALUE ',' TOK_GENERIC_VALUE

      REDUCE:
      TOK_GENERIC_VALUE_LIST :=
        TOK_GENERIC_VALUE_LIST ',' TOK_GENERIC_VALUE
    */
    digest_storage->m_byte_count-= 3*SIZE_OF_A_TOKEN;
    store_token(digest_storage, TOK_GENERIC_VALUE_LIST);
  }

  if (token_to_push != TOK_UNUSED)
  {
    /*
      Push TOKEN_Y
    */
    store_token(digest_storage, token_to_push);
  }

  return state;
}
688 
689